diff --git a/.bazelrc b/.bazelrc index 310eb293389d..67b5bfe76b92 100644 --- a/.bazelrc +++ b/.bazelrc @@ -1,6 +1,7 @@ build --copt=--std=c++14 build --copt=-I. build --copt=-isystem --copt bazel-out/k8-fastbuild/bin +build --experimental_ui_max_stdouterr_bytes=2048576 # Configuration to disable tty features for environments like CI build:no-tty --curses no @@ -11,3 +12,8 @@ build:no-tty --show_progress_rate_limit 10 build:gpu --define=cuda=true # define a separate build folder for faster switching between configs build:gpu --platform_suffix=-gpu +# rules_cuda configuration +build:gpu --@rules_cuda//cuda:enable_cuda +build:gpu --@rules_cuda//cuda:cuda_targets=sm_52 +build:gpu --@rules_cuda//cuda:compiler=nvcc +build:gpu --repo_env=CUDA_PATH=/usr/local/cuda diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 1899429e0bb4..7d428014cd79 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1 +1 @@ -Fixes #{issue number} +Fixes #ISSUE_NUMBER diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json index f822adf6c4b5..6439e1c0416f 100644 --- a/.github/generated-ciflow-ruleset.json +++ b/.github/generated-ciflow-ruleset.json @@ -20,13 +20,13 @@ "linux-docs-push", "linux-vulkan-bionic-py3.6-clang9", "linux-xenial-cuda11.3-py3.6-gcc7", + "linux-xenial-cuda11.3-py3.6-gcc7-bazel-test", "linux-xenial-py3-clang5-mobile-build", "linux-xenial-py3-clang5-mobile-custom-build-static", "linux-xenial-py3.6-clang7-asan", "linux-xenial-py3.6-clang7-onnx", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7", - "linux-xenial-py3.6-gcc7-bazel-test", "macos-10-15-py3-arm64", "macos-10-15-py3-lite-interpreter-x86-64", "macos-11-py3-x86-64", @@ -48,7 +48,7 @@ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" ], "ciflow/bazel": [ - "linux-xenial-py3.6-gcc7-bazel-test" + "linux-xenial-cuda11.3-py3.6-gcc7-bazel-test" ], "ciflow/cpu": [ 
"caffe2-linux-xenial-py3.6-gcc5.4", @@ -56,11 +56,11 @@ "linux-docs", "linux-docs-push", "linux-vulkan-bionic-py3.6-clang9", + "linux-xenial-cuda11.3-py3.6-gcc7-bazel-test", "linux-xenial-py3.6-clang7-asan", "linux-xenial-py3.6-clang7-onnx", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7", - "linux-xenial-py3.6-gcc7-bazel-test", "parallelnative-linux-xenial-py3.6-gcc5.4", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", @@ -85,13 +85,13 @@ "linux-docs", "linux-vulkan-bionic-py3.6-clang9", "linux-xenial-cuda11.3-py3.6-gcc7", + "linux-xenial-cuda11.3-py3.6-gcc7-bazel-test", "linux-xenial-py3-clang5-mobile-build", "linux-xenial-py3-clang5-mobile-custom-build-static", "linux-xenial-py3.6-clang7-asan", "linux-xenial-py3.6-clang7-onnx", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7", - "linux-xenial-py3.6-gcc7-bazel-test", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", "win-vs2019-cpu-py3", @@ -126,13 +126,13 @@ "linux-docs-push", "linux-vulkan-bionic-py3.6-clang9", "linux-xenial-cuda11.3-py3.6-gcc7", + "linux-xenial-cuda11.3-py3.6-gcc7-bazel-test", "linux-xenial-py3-clang5-mobile-build", "linux-xenial-py3-clang5-mobile-custom-build-static", "linux-xenial-py3.6-clang7-asan", "linux-xenial-py3.6-clang7-onnx", "linux-xenial-py3.6-gcc5.4", "linux-xenial-py3.6-gcc7", - "linux-xenial-py3.6-gcc7-bazel-test", "parallelnative-linux-xenial-py3.6-gcc5.4", "periodic-libtorch-linux-bionic-cuda11.5-py3.6-gcc7", "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7", @@ -185,6 +185,40 @@ "ciflow/slow-gradcheck": [ "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck" ], + "ciflow/trunk": [ + "caffe2-linux-xenial-py3.6-gcc5.4", + "docker-builds", + "ios-12-5-1-arm64", + "ios-12-5-1-arm64-coreml", + "ios-12-5-1-arm64-custom-ops", + 
"ios-12-5-1-arm64-full-jit", + "ios-12-5-1-arm64-metal", + "ios-12-5-1-x86-64", + "ios-12-5-1-x86-64-coreml", + "ios-12-5-1-x86-64-full-jit", + "libtorch-linux-xenial-cuda10.2-py3.6-gcc7", + "libtorch-linux-xenial-cuda11.3-py3.6-gcc7", + "linux-bionic-cuda10.2-py3.9-gcc7", + "linux-bionic-py3.6-clang9", + "linux-docs", + "linux-vulkan-bionic-py3.6-clang9", + "linux-xenial-cuda11.3-py3.6-gcc7", + "linux-xenial-cuda11.3-py3.6-gcc7-bazel-test", + "linux-xenial-py3-clang5-mobile-build", + "linux-xenial-py3-clang5-mobile-custom-build-static", + "linux-xenial-py3.6-clang7-asan", + "linux-xenial-py3.6-clang7-onnx", + "linux-xenial-py3.6-gcc5.4", + "linux-xenial-py3.6-gcc7", + "macos-10-15-py3-arm64", + "macos-10-15-py3-lite-interpreter-x86-64", + "macos-11-py3-x86-64", + "parallelnative-linux-xenial-py3.6-gcc5.4", + "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", + "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", + "win-vs2019-cpu-py3", + "win-vs2019-cuda11.3-py3" + ], "ciflow/vulkan": [ "linux-vulkan-bionic-py3.6-clang9" ], diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index fe9633623547..0015c455c222 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -72,6 +72,7 @@ LABEL_CIFLOW_SLOW_GRADCHECK = "ciflow/slow-gradcheck" LABEL_CIFLOW_DOCKER = "ciflow/docker" LABEL_CIFLOW_IOS = "ciflow/ios" LABEL_CIFLOW_MACOS = "ciflow/macos" +LABEL_CIFLOW_TRUNK = "ciflow/trunk" @dataclass @@ -114,6 +115,8 @@ class CIFlowConfig: def __post_init__(self) -> None: self.labels.add(LABEL_CIFLOW_ALL) + if LABEL_CIFLOW_SCHEDULED not in self.labels: + self.labels.add(LABEL_CIFLOW_TRUNK) assert all(label.startswith(LABEL_CIFLOW_PREFIX) for label in self.labels) self.gen_root_job_condition() @@ -224,6 +227,7 @@ class CIWorkflow: assert LABEL_CIFLOW_CPU in self.ciflow_config.labels if self.is_scheduled: assert LABEL_CIFLOW_DEFAULT 
not in self.ciflow_config.labels + assert LABEL_CIFLOW_TRUNK not in self.ciflow_config.labels assert LABEL_CIFLOW_SCHEDULED in self.ciflow_config.labels if self.build_with_debug: assert self.build_environment.endswith("-debug") @@ -605,8 +609,8 @@ ANDROID_WORKFLOWS = [ BAZEL_WORKFLOWS = [ CIWorkflow( arch="linux", - build_environment="linux-xenial-py3.6-gcc7-bazel-test", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7", + build_environment="linux-xenial-cuda11.3-py3.6-gcc7-bazel-test", + docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7", test_runner_type=LINUX_CPU_TEST_RUNNER, ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BAZEL, LABEL_CIFLOW_CPU, LABEL_CIFLOW_LINUX}, diff --git a/.github/workflows/generated-caffe2-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-caffe2-linux-xenial-py3.6-gcc5.4.yml index 81d7bdda452e..4427f56db6d1 100644 --- a/.github/workflows/generated-caffe2-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-caffe2-linux-xenial-py3.6-gcc5.4.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') 
|| - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml b/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml index 6b5140ebf9f4..d1b105b8f608 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml b/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml index 1fcb3f62548f..38a640b88700 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml 
b/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml index 743fb50d30d6..e96213f6859c 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-arm64-metal.yml b/.github/workflows/generated-ios-12-5-1-arm64-metal.yml index e3523bac9929..c9ebc182e5ca 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64-metal.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64-metal.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ 
(github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-arm64.yml b/.github/workflows/generated-ios-12-5-1-arm64.yml index 642edaef8e12..5b1ce93aa6e0 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ 
contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml b/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml index 3c797272bc63..7de34ee9f9fe 100644 --- a/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml +++ b/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ 
toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml b/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml index 49c5c5a9d89f..c9b77d9c187b 100644 --- a/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml +++ b/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-ios-12-5-1-x86-64.yml b/.github/workflows/generated-ios-12-5-1-x86-64.yml index ab896312adf7..a32aae9b53ff 100644 --- a/.github/workflows/generated-ios-12-5-1-x86-64.yml +++ b/.github/workflows/generated-ios-12-5-1-x86-64.yml @@ -29,12 +29,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/ios') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || 
(false)) }} steps: diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml index 1d8a6d436bc9..cccbfe6aac48 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml index ca1eacb76d1a..e5b19d94f39b 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index ed6088c55a36..6182c692a4e8 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository_owner == 'pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/slow') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-linux-bionic-py3.6-clang9.yml b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml index 24ea32c45df4..5817687c0d59 100644 --- a/.github/workflows/generated-linux-bionic-py3.6-clang9.yml +++ b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/noarch') || contains(github.event.pull_request.labels.*.name, 'ciflow/xla') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/noarch') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/xla') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/noarch') || contains(github.event.pull_request.labels.*.name, 'ciflow/xla')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/noarch') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/xla')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-docs.yml b/.github/workflows/generated-linux-docs.yml index a9dcc1a5d009..0fa49205cc59 100644 --- a/.github/workflows/generated-linux-docs.yml +++ b/.github/workflows/generated-linux-docs.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/docs') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/docs') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/docs') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/docs') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml b/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml index a68668acffa8..178a3bad0af7 100644 --- a/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml +++ b/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/vulkan') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/vulkan') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/vulkan')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/vulkan')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7-bazel-test.yml similarity index 95% rename from .github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml rename to .github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7-bazel-test.yml index ac02841fd832..515dc274ab7e 
100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7-bazel-test.yml @@ -1,7 +1,7 @@ # @generated DO NOT EDIT MANUALLY # Template is at: .github/templates/bazel_ci_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3.6-gcc7-bazel-test +name: linux-xenial-cuda11.3-py3.6-gcc7-bazel-test on: pull_request: @@ -14,8 +14,8 @@ on: workflow_dispatch: env: - BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc7-bazel-test - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7 + BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.6-gcc7-bazel-test + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla TORCH_CUDA_ARCH_LIST: 5.2 @@ -33,7 +33,7 @@ env: SHA1: ${{ github.event.pull_request.head.sha || github.sha }} PYTORCH_RETRY_TEST_CASES: 1 concurrency: - group: linux-xenial-py3.6-gcc7-bazel-test-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + group: linux-xenial-cuda11.3-py3.6-gcc7-bazel-test-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} 
+ LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: @@ -62,7 +62,7 @@ jobs: runs-on: linux.2xlarge needs: [ciflow_should_run] env: - JOB_BASE_NAME: linux-xenial-py3.6-gcc7-bazel-test-build-and-test + JOB_BASE_NAME: linux-xenial-cuda11.3-py3.6-gcc7-bazel-test-build-and-test NUM_TEST_SHARDS: 1 steps: - name: Display EC2 information @@ -302,7 +302,7 @@ jobs: env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-py3.6-gcc7-bazel-test-test + 
JOB_BASE_NAME: linux-xenial-cuda11.3-py3.6-gcc7-bazel-test-test PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} TAG: ${{ steps.parse-ref.outputs.tag }} diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml index 5d875c6d8e71..4eb048f23e70 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml index 5ee190174e03..89fc5c17e984 100644 --- a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml +++ b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/mobile') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/mobile') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/mobile')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/mobile') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml index c4b46ddd24e7..6d7fcaed6b86 100644 --- a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml +++ b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/mobile') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/mobile') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 
'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/mobile')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/mobile') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml b/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml index cefe299d7562..c92271b2592c 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/sanitizers') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/sanitizers') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/sanitizers')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/sanitizers') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml b/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml index c3fa2c75dfbc..e4177958d3dd 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/onnx') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/onnx') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/onnx')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/onnx') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml index a6680d0fd49e..375dcbcdbf24 100644 --- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml @@ -43,12 +43,12 @@ 
jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository_owner == 'pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7.yml index 40a1b900d9f2..94d84a20b191 100644 --- 
a/.github/workflows/generated-linux-xenial-py3.6-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository_owner == 'pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-macos-10-15-py3-arm64.yml 
b/.github/workflows/generated-macos-10-15-py3-arm64.yml index 3fa0f0cacf72..64108c2da72c 100644 --- a/.github/workflows/generated-macos-10-15-py3-arm64.yml +++ b/.github/workflows/generated-macos-10-15-py3-arm64.yml @@ -32,12 +32,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml b/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml index c54322afefa7..d98e6874da61 100644 --- a/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml +++ b/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml @@ -34,12 +34,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 
'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-macos-11-py3-x86-64.yml b/.github/workflows/generated-macos-11-py3-x86-64.yml index 1ef99b1f0935..23542a870ca4 100644 --- a/.github/workflows/generated-macos-11-py3-x86-64.yml +++ b/.github/workflows/generated-macos-11-py3-x86-64.yml @@ -34,12 +34,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos')) || + 
(contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/macos') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml index a7faee91c0d9..4ba6bf59a999 100644 --- a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml +++ b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || (false)) }} steps: diff --git 
a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml index b92817a2dfb8..93f08e024bc4 100644 --- a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml +++ b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml index 010c1d8d3167..faa935a61b65 100644 --- a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml +++ b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml @@ -43,12 +43,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository == 'pytorch/pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/android') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index d9f064b46bd4..5392555ff08e 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -47,12 +47,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win') }} + 
LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/win') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository_owner == 'pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index e38c713d0e54..3bb47d35fa94 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -48,12 +48,12 @@ jobs: timeout-minutes: 240 env: IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }} - LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || 
contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win') }} + LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/win') }} LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} if: ${{ (github.repository_owner == 'pytorch') && ( (github.event_name == 'push') || (github.event_name == 'schedule') || - (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) || + (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) || ((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/'))) }} steps: diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 813a9710873b..49c41a831bed 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -207,11 +207,10 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then get_bazel - # first build the whole torch for CPU-only + # first build torch for CPU-only tools/bazel build --config=no-tty :torch - # then build selected set of targets with GPU-support. 
- # TODO: eventually this should converge to building the whole :torch with GPU-support - tools/bazel build --config=no-tty --config=gpu //c10 + # then build everything with CUDA + tools/bazel build --config=no-tty --config=gpu :all else # check that setup.py would fail with bad arguments echo "The next three invocations are expected to fail with invalid command error messages." diff --git a/BUILD.bazel b/BUILD.bazel index 23fe73cef91e..b65d77b882b8 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -3,7 +3,7 @@ load("@pybind11_bazel//:build_defs.bzl", "pybind_extension") load("@rules_proto//proto:defs.bzl", "proto_library") load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_proto_library", "cc_test") load("//third_party:substitution.bzl", "header_template_rule") -load("//:tools/build_variables.bzl", "torch_cpp_srcs", "libtorch_python_core_sources", "libtorch_core_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "jit_core_sources") +load("//:tools/build_variables.bzl", "jit_core_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_nvfuser_generated_headers", "libtorch_nvfuser_runtime_sources", "libtorch_python_core_sources", "torch_cpp_srcs") load("//tools/rules:cu.bzl", "cu_library") load("//tools/config:defs.bzl", "if_cuda") load("//:aten.bzl", "intern_build_aten_ops", "generate_aten") @@ -15,6 +15,7 @@ COMMON_COPTS = [ "-DHAVE_SHM_UNLINK=1", "-D_FILE_OFFSET_BITS=64", "-DHAVE_GCC_GET_CPUID", + "-DTH_BLAS_MKL", "-DUSE_GCC_GET_CPUID", "-DTH_HAVE_THREAD", "-DUSE_FBGEMM", @@ -37,11 +38,11 @@ py_binary( ], ) +aten_generation_srcs = ["aten/src/ATen/native/native_functions.yaml"] + glob(["aten/src/ATen/templates/**"]) + generate_aten( name = "generated_cpp", - srcs = [ - "aten/src/ATen/native/native_functions.yaml", - ] + glob(["aten/src/ATen/templates/**"]), + srcs = aten_generation_srcs, outs = [ "aten/src/ATen/Declarations.yaml", 
"aten/src/ATen/RegisterBackendSelect.cpp", @@ -62,8 +63,6 @@ generate_aten( "aten/src/ATen/RegisterSchema.cpp", "aten/src/ATen/CPUFunctions.h", "aten/src/ATen/CPUFunctions_inl.h", - "aten/src/ATen/CUDAFunctions.h", - "aten/src/ATen/CUDAFunctions_inl.h", "aten/src/ATen/CompositeExplicitAutogradFunctions.h", "aten/src/ATen/CompositeExplicitAutogradFunctions_inl.h", "aten/src/ATen/CompositeImplicitAutogradFunctions.h", @@ -82,6 +81,8 @@ generate_aten( "aten/src/ATen/MetaFunctions.h", "aten/src/ATen/MetaFunctions_inl.h", "aten/src/ATen/MethodOperators.h", + "aten/src/ATen/NativeMetaFunctions.h", + "aten/src/ATen/RegistrationDeclarations.h", "aten/src/ATen/core/TensorBody.h", "aten/src/ATen/core/TensorMethods.cpp", "aten/src/ATen/core/ATenOpList.cpp", @@ -89,6 +90,23 @@ generate_aten( generator=":gen", ) +# this hack is due to https://github.com/bazelbuild/bazel/issues/281 +# since `outs` cannot be configured with if_cuda, we rerun the same command and declare cuda related files separately here. +genrule( + name = "generated_cuda_cpp", + srcs = aten_generation_srcs, + outs = [ + "aten/src/ATen/CUDAFunctions.h", + "aten/src/ATen/CUDAFunctions_inl.h", + "aten/src/ATen/RegisterCUDA.cpp", + "aten/src/ATen/RegisterQuantizedCUDA.cpp", + "aten/src/ATen/RegisterSparseCUDA.cpp", + "aten/src/ATen/RegisterSparseCsrCUDA.cpp", + ], + cmd = "$(location :gen) --source-path `dirname $(location aten/src/ATen/native/native_functions.yaml)`/.. 
--install_dir `dirname $(location aten/src/ATen/RegisterCUDA.cpp)`", + tools = [":gen"], +) + py_library( name = "tools_codegen", srcs = glob(["tools/codegen/**/*.py"]), @@ -230,7 +248,7 @@ filegroup( filegroup( name = "aten_native_mkl_cpp", - srcs = glob(["aten/src/ATen/native/mkl/*.cpp"]), + srcs = glob(["aten/src/ATen/native/mkl/*.cpp", "aten/src/ATen/mkl/*.cpp"]), ) filegroup( @@ -266,135 +284,40 @@ filegroup( ) filegroup( - name = "aten_cuda_srcs", - srcs = [ - "aten/src/ATen/cuda/CUDABlas.cpp", - "aten/src/ATen/cuda/CUDASolver.cpp", - "aten/src/ATen/cuda/CUDAContext.cpp", - "aten/src/ATen/cuda/CUDAGeneratorImpl.cpp", - "aten/src/ATen/cuda/CUDAGraph.cpp", - "aten/src/ATen/cuda/CuSparseHandlePool.cpp", - "aten/src/ATen/cuda/CublasHandlePool.cpp", - "aten/src/ATen/cuda/CusolverDnHandlePool.cpp", - "aten/src/ATen/cuda/PinnedMemoryAllocator.cpp", - "aten/src/ATen/cuda/detail/CUDAHooks.cpp", - "aten/src/ATen/cudnn/AutocastRNN.cpp", - "aten/src/ATen/cudnn/Descriptors.cpp", - "aten/src/ATen/cudnn/Handle.cpp", - "aten/src/ATen/cudnn/Types.cpp", - "aten/src/ATen/native/cuda/CUDAUnaryOps.cpp", - "aten/src/ATen/native/cuda/TensorShapeCUDA.cpp", - "aten/src/ATen/native/cudnn/AffineGridGenerator.cpp", - "aten/src/ATen/native/cudnn/BatchNorm.cpp", - "aten/src/ATen/native/cudnn/Conv.cpp", - "aten/src/ATen/native/cudnn/GridSampler.cpp", - "aten/src/ATen/native/cudnn/LossCTC.cpp", - "aten/src/ATen/native/cudnn/RNN.cpp", - "aten/src/ATen/native/miopen/BatchNorm_miopen.cpp", - "aten/src/ATen/native/miopen/Conv_miopen.cpp", - "aten/src/ATen/native/miopen/RNN_miopen.cpp", - "aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp", - "aten/src/ATen/native/sparse/cuda/SparseBlas.cpp", - "aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp", - ], + name = "aten_cuda_cpp_srcs", + srcs = glob( + [ + "aten/src/ATen/cuda/*.cpp", + "aten/src/ATen/cuda/detail/*.cpp", + "aten/src/ATen/cudnn/*.cpp", + "aten/src/ATen/native/cuda/*.cpp", + "aten/src/ATen/native/cudnn/*.cpp", + 
"aten/src/ATen/native/miopen/*.cpp", + "aten/src/ATen/native/sparse/cuda/*.cpp", + "aten/src/THC/*.cpp", + ], + ), ) filegroup( - name = "aten_srcs_cu", - srcs = [ - "aten/src/ATen/cuda/cub.cu.cc", - "aten/src/ATen/cuda/detail/IndexUtils.cu.cc", - "aten/src/ATen/cuda/detail/CUDAGraphsUtils.cu.cc", - "aten/src/ATen/native/cuda/Activation.cu.cc", - "aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu.cc", - "aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu.cc", - "aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu.cc", - "aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu.cc", - "aten/src/ATen/native/cuda/AveragePool2d.cu.cc", - "aten/src/ATen/native/cuda/AveragePool3d.cu.cc", - "aten/src/ATen/native/cuda/BatchLinearAlgebra.cu.cc", - "aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu.cc", - "aten/src/ATen/native/cuda/BinaryArithmeticKernel.cu.cc", - "aten/src/ATen/native/cuda/BinaryCompareKernel.cu.cc", - "aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu.cc", - "aten/src/ATen/native/cuda/CUDAScalar.cu.cc", - "aten/src/ATen/native/cuda/Col2Im.cu.cc", - "aten/src/ATen/native/cuda/Copy.cu.cc", - "aten/src/ATen/native/cuda/CrossKernel.cu.cc", - "aten/src/ATen/native/cuda/DilatedMaxPool2d.cu.cc", - "aten/src/ATen/native/cuda/DilatedMaxPool3d.cu.cc", - "aten/src/ATen/native/cuda/DistanceKernel.cu.cc", - "aten/src/ATen/native/cuda/Distributions.cu.cc", - "aten/src/ATen/native/cuda/Dropout.cu.cc", - "aten/src/ATen/native/cuda/Embedding.cu.cc", - "aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu.cc", - "aten/src/ATen/native/cuda/EmbeddingBag.cu.cc", - "aten/src/ATen/native/cuda/FillKernel.cu.cc", - "aten/src/ATen/native/cuda/FractionalMaxPool2d.cu.cc", - "aten/src/ATen/native/cuda/FractionalMaxPool3d.cu.cc", - "aten/src/ATen/native/cuda/GridSampler.cu.cc", - "aten/src/ATen/native/cuda/Im2Col.cu.cc", - "aten/src/ATen/native/cuda/IndexKernel.cu.cc", - "aten/src/ATen/native/cuda/Indexing.cu.cc", - "aten/src/ATen/native/cuda/Lerp.cu.cc", - 
"aten/src/ATen/native/cuda/LinearAlgebra.cu.cc", - "aten/src/ATen/native/cuda/Loss.cu.cc", - "aten/src/ATen/native/cuda/LossCTC.cu.cc", - "aten/src/ATen/native/cuda/MaxUnpooling.cu.cc", - "aten/src/ATen/native/cuda/MultinomialKernel.cu.cc", - "aten/src/ATen/native/cuda/MultiLabelMarginCriterion.cu.cc", - "aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu.cc", - "aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu.cc", - "aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu.cc", - "aten/src/ATen/native/cuda/NLLLoss2d.cu.cc", - "aten/src/ATen/native/cuda/Normalization.cu.cc", - "aten/src/ATen/native/cuda/PointwiseOpsKernel.cu.cc", - "aten/src/ATen/native/cuda/PowKernel.cu.cc", - "aten/src/ATen/native/cuda/RNN.cu.cc", - "aten/src/ATen/native/cuda/RangeFactories.cu.cc", - "aten/src/ATen/native/cuda/Reduce.cu.cc", - "aten/src/ATen/native/cuda/ReduceOpsKernel.cu.cc", - "aten/src/ATen/native/cuda/ReflectionPad.cu.cc", - "aten/src/ATen/native/cuda/Repeat.cu.cc", - "aten/src/ATen/native/cuda/ReplicationPadding.cu.cc", - "aten/src/ATen/native/cuda/Resize.cu.cc", - "aten/src/ATen/native/cuda/SegmentReduce.cu.cc", - "aten/src/ATen/native/cuda/SoftMax.cu.cc", - "aten/src/ATen/native/cuda/SortingKthValue.cu.cc", - "aten/src/ATen/native/cuda/SparseMM.cu.cc", - "aten/src/ATen/native/cuda/SpectralOps.cu.cc", - "aten/src/ATen/native/cuda/SummaryOps.cu.cc", - "aten/src/ATen/native/cuda/TensorCompare.cu.cc", - "aten/src/ATen/native/cuda/TensorFactories.cu.cc", - "aten/src/ATen/native/cuda/TensorTopK.cu.cc", - "aten/src/ATen/native/cuda/TensorTransformations.cu.cc", - "aten/src/ATen/native/cuda/TriangularOps.cu.cc", - "aten/src/ATen/native/cuda/UnaryOpsKernel.cu.cc", - "aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu.cc", - "aten/src/ATen/native/cuda/Unique.cu.cc", - "aten/src/ATen/native/cuda/UpSampleBicubic2d.cu.cc", - "aten/src/ATen/native/cuda/UpSampleBilinear2d.cu.cc", - "aten/src/ATen/native/cuda/UpSampleLinear1d.cu.cc", - 
"aten/src/ATen/native/cuda/UpSampleNearest1d.cu.cc", - "aten/src/ATen/native/cuda/UpSampleNearest2d.cu.cc", - "aten/src/ATen/native/cuda/UpSampleNearest3d.cu.cc", - "aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu.cc", - "aten/src/ATen/native/cuda/WeightNorm.cu.cc", - "aten/src/ATen/native/cuda/layer_norm_kernel.cu.cc", - "aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu.cc", - "aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu.cc", - "aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu.cc", - "aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu.cc", - ], + name = "aten_cu_srcs", + srcs = glob([ + "aten/src/ATen/cuda/*.cu", + "aten/src/ATen/cuda/detail/*.cu", + "aten/src/ATen/native/cuda/*.cu", + "aten/src/ATen/native/quantized/cuda/*.cu", + "aten/src/ATen/native/sparse/cuda/*.cu", + ]), ) header_template_rule( name = "aten_src_ATen_config", src = "aten/src/ATen/Config.h.in", out = "aten/src/ATen/Config.h", + include = "aten/src", substitutions = { "@AT_MKLDNN_ENABLED@": "1", - "@AT_MKL_ENABLED@": "0", + "@AT_MKL_ENABLED@": "1", "@AT_FFTW_ENABLED@": "0", "@AT_POCKETFFT_ENABLED@": "0", "@AT_NNPACK_ENABLED@": "0", @@ -413,6 +336,7 @@ header_template_rule( name = "aten_src_ATen_cuda_config", src = "aten/src/ATen/cuda/CUDAConfig.h.in", out = "aten/src/ATen/cuda/CUDAConfig.h", + include = "aten/src", substitutions = { "@AT_CUDNN_ENABLED@": "1", "@AT_ROCM_ENABLED@": "0", @@ -429,18 +353,19 @@ cc_library( ] + glob([ "aten/src/**/*.h", "aten/src/**/*.hpp", + "aten/src/ATen/cuda/**/*.cuh", + "aten/src/ATen/native/**/*.cuh", "aten/src/TH/**/*.cpp", "aten/src/THC/*.cuh", + "aten/src/THC/generic/*.cu", ], - exclude = [ - "aten/src/ATen/Config.h", - ],) + [ - ":generated_cpp", + ) + [ ":aten_src_ATen_config", + ":generated_cpp", + ":generated_cuda_cpp", ], includes = [ "aten/src", - "aten/src/TH", ], deps = [ "//c10:headers", @@ -464,6 +389,7 @@ intern_build_aten_ops( ":aten_headers", "@sleef", "@fbgemm", + "@mkl", ], ) @@ -530,12 +456,17 @@ cc_binary( 
cc_library( name = "aten_cuda_cpp", - srcs = [":aten_cuda_srcs"], + srcs = [ + ":aten_cuda_cpp_srcs", + ":generated_cuda_cpp", + ], + hdrs = [":aten_src_ATen_cuda_config"], copts = ATEN_COPTS, visibility = ["//visibility:public"], deps = [ ":aten", "@cuda", + "@cuda//:cusolver", "@cuda//:nvrtc", "@cudnn", ], @@ -552,9 +483,7 @@ torch_cuda_half_options = [ cu_library( name = "aten_cuda", - srcs = [ - ":aten_srcs_cu", - ], + srcs = [":aten_cu_srcs"], copts = ATEN_COPTS + torch_cuda_half_options, visibility = ["//visibility:public"], deps = [ @@ -618,6 +547,7 @@ header_template_rule( filegroup( name = "caffe2_contrib_srcs", srcs = [ + "caffe2/contrib/aten/aten_op.cc", "caffe2/contrib/gloo/allgather_ops.cc", "caffe2/contrib/gloo/allreduce_ops.cc", "caffe2/contrib/gloo/barrier_ops.cc", @@ -787,6 +717,7 @@ filegroup( "caffe2/operators/conv_op_eigen.cc", "caffe2/operators/conv_op_shared.cc", "caffe2/operators/conv_transpose_gradient_op.cc", + "caffe2/operators/conv_transpose_op.cc", "caffe2/operators/conv_transpose_op_mobile.cc", "caffe2/operators/copy_op.cc", "caffe2/operators/copy_rows_to_tensor_op.cc", @@ -1182,7 +1113,7 @@ filegroup( ) filegroup( - name = "caffe2_cuda_srcs", + name = "caffe2_cuda_cpp_srcs", srcs = [ "caffe2/contrib/aten/aten_op_gpu.cc", "caffe2/contrib/gloo/allreduce_ops_gpu.cc", @@ -1251,155 +1182,155 @@ filegroup( filegroup( name = "caffe2_cu_srcs", srcs = [ - "caffe2/core/context_gpu.cu.cc", - "caffe2/operators/abs_op.cu.cc", - "caffe2/operators/accumulate_op.cu.cc", - "caffe2/operators/accuracy_op.cu.cc", - "caffe2/operators/acos_op.cu.cc", - "caffe2/operators/affine_channel_op.cu.cc", - "caffe2/operators/alias_with_name.cu.cc", - "caffe2/operators/arg_ops.cu.cc", - "caffe2/operators/asin_op.cu.cc", - "caffe2/operators/assert_op.cu.cc", - "caffe2/operators/atan_op.cu.cc", - "caffe2/operators/batch_gather_ops.cu.cc", - "caffe2/operators/batch_matmul_op.cu.cc", - "caffe2/operators/batch_moments_op.cu.cc", - 
"caffe2/operators/batch_permutation_op.cu.cc", - "caffe2/operators/batch_sparse_to_dense_op.cu.cc", - "caffe2/operators/boolean_mask_ops.cu.cc", - "caffe2/operators/boolean_unmask_ops.cu.cc", - "caffe2/operators/bucketize_op.cu.cc", - "caffe2/operators/cast_op.cu.cc", - "caffe2/operators/cbrt_op.cu.cc", - "caffe2/operators/ceil_op.cu.cc", - "caffe2/operators/channel_backprop_stats_op.cu.cc", - "caffe2/operators/channel_shuffle_op.cu.cc", - "caffe2/operators/channel_stats_op.cu.cc", - "caffe2/operators/channelwise_conv3d_op_cudnn.cu.cc", - "caffe2/operators/clip_op.cu.cc", - "caffe2/operators/copy_op.cu.cc", - "caffe2/operators/cos_op.cu.cc", - "caffe2/operators/cosh_op.cu.cc", - "caffe2/operators/cosine_embedding_criterion_op.cu.cc", - "caffe2/operators/cross_entropy_op.cu.cc", - "caffe2/operators/cube_op.cu.cc", - "caffe2/operators/data_couple_gpu.cu.cc", - "caffe2/operators/deform_conv_op.cu.cc", - "caffe2/operators/depthwise_3x3_conv_op_cudnn.cu.cc", - "caffe2/operators/distance_op.cu.cc", - "caffe2/operators/dropout_op.cu.cc", - "caffe2/operators/elementwise_div_op.cu.cc", - "caffe2/operators/elementwise_linear_op.cu.cc", - "caffe2/operators/elementwise_mul_op.cu.cc", - "caffe2/operators/elementwise_ops.cu.cc", - "caffe2/operators/elu_op.cu.cc", - "caffe2/operators/enforce_finite_op.cu.cc", - "caffe2/operators/ensure_cpu_output_op.cu.cc", - "caffe2/operators/erf_op.cu.cc", - "caffe2/operators/filler_op.cu.cc", - "caffe2/operators/find_op.cu.cc", - "caffe2/operators/floor_op.cu.cc", - "caffe2/operators/gather_op.cu.cc", - "caffe2/operators/gelu_op.cu.cc", - "caffe2/operators/generate_proposals_op.cu.cc", - "caffe2/operators/generate_proposals_op_util_nms_gpu.cu.cc", - "caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.cu.cc", - "caffe2/operators/given_tensor_fill_op.cu.cc", - "caffe2/operators/glu_op.cu.cc", - "caffe2/operators/group_norm_op.cu.cc", - "caffe2/operators/gru_unit_op_gpu.cu.cc", - "caffe2/operators/half_float_ops.cu.cc", - 
"caffe2/operators/hard_sigmoid_op.cu.cc", - "caffe2/operators/instance_norm_op.cu.cc", - "caffe2/operators/integral_image_op.cu.cc", - "caffe2/operators/layer_norm_op.cu.cc", - "caffe2/operators/leaky_relu_op.cu.cc", - "caffe2/operators/lengths_pad_op.cu.cc", - "caffe2/operators/lengths_tile_op.cu.cc", - "caffe2/operators/local_response_normalization_op.cu.cc", - "caffe2/operators/logit_op.cu.cc", - "caffe2/operators/loss_op.cu.cc", - "caffe2/operators/lp_pool_op.cu.cc", - "caffe2/operators/lstm_unit_op_gpu.cu.cc", - "caffe2/operators/margin_ranking_criterion_op.cu.cc", - "caffe2/operators/max_pool_with_index.cu.cc", - "caffe2/operators/mean_op.cu.cc", - "caffe2/operators/mem_query_op.cu.cc", - "caffe2/operators/minmax_ops.cu.cc", - "caffe2/operators/moments_op.cu.cc", - "caffe2/operators/multi_class_accuracy_op.cu.cc", - "caffe2/operators/normalize_ops.cu.cc", - "caffe2/operators/one_hot_ops.cu.cc", - "caffe2/operators/pack_segments.cu.cc", - "caffe2/operators/pad_op_gpu.cu.cc", - "caffe2/operators/perplexity_op.cu.cc", - "caffe2/operators/piecewise_linear_transform_op.cu.cc", - "caffe2/operators/pool_op.cu.cc", - "caffe2/operators/pow_op.cu.cc", - "caffe2/operators/prelu_op.cu.cc", - "caffe2/operators/reciprocal_op.cu.cc", - "caffe2/operators/reduce_front_back_max_ops.cu.cc", - "caffe2/operators/reduce_front_back_sum_mean_ops.cu.cc", - "caffe2/operators/reduce_ops.cu.cc", - "caffe2/operators/reduction_ops.cu.cc", - "caffe2/operators/relu_n_op.cu.cc", - "caffe2/operators/relu_op.cu.cc", - "caffe2/operators/replace_nan_op.cu.cc", - "caffe2/operators/resize_3d_op.cu.cc", - "caffe2/operators/resize_op.cu.cc", - "caffe2/operators/reverse_packed_segs_op.cu.cc", - "caffe2/operators/rmac_regions_op.cu.cc", - "caffe2/operators/rnn/recurrent_network_op_gpu.cu.cc", - "caffe2/operators/roi_align_gradient_op.cu.cc", - "caffe2/operators/roi_align_op.cu.cc", - "caffe2/operators/roi_align_rotated_gradient_op.cu.cc", - "caffe2/operators/roi_align_rotated_op.cu.cc", - 
"caffe2/operators/roi_pool_op.cu.cc", - "caffe2/operators/rsqrt_op.cu.cc", - "caffe2/operators/scale_blobs_op.cu.cc", - "caffe2/operators/segment_reduction_op_gpu.cu.cc", - "caffe2/operators/selu_op.cu.cc", - "caffe2/operators/sequence_ops.cu.cc", - "caffe2/operators/sigmoid_op.cu.cc", - "caffe2/operators/sin_op.cu.cc", - "caffe2/operators/sinh_op.cu.cc", - "caffe2/operators/slice_op.cu.cc", - "caffe2/operators/softmax_ops.cu.cc", - "caffe2/operators/softplus_op.cu.cc", - "caffe2/operators/softsign_op.cu.cc", - "caffe2/operators/space_batch_op_gpu.cu.cc", - "caffe2/operators/sparse_normalize_op_gpu.cu.cc", - "caffe2/operators/sparse_to_dense_op.cu.cc", - "caffe2/operators/spatial_batch_norm_op.cu.cc", - "caffe2/operators/spatial_batch_norm_op_cudnn.cu.cc", - "caffe2/operators/stump_func_op.cu.cc", - "caffe2/operators/summarize_op.cu.cc", - "caffe2/operators/swish_op.cu.cc", - "caffe2/operators/tan_op.cu.cc", - "caffe2/operators/tanh_op.cu.cc", - "caffe2/operators/thresholded_relu_op.cu.cc", - "caffe2/operators/tile_op.cu.cc", - "caffe2/operators/top_k.cu.cc", - "caffe2/operators/transpose_op.cu.cc", - "caffe2/operators/unique_ops.cu.cc", - "caffe2/operators/upsample_op.cu.cc", - "caffe2/operators/utility_ops.cu.cc", - "caffe2/operators/weighted_sample_op.cu.cc", - "caffe2/sgd/adadelta_op_gpu.cu.cc", - "caffe2/sgd/adagrad_op_gpu.cu.cc", - "caffe2/sgd/adam_op_gpu.cu.cc", - "caffe2/sgd/fp16_momentum_sgd_op.cu.cc", - "caffe2/sgd/fp32_momentum_sgd_op.cu.cc", - "caffe2/sgd/lars_op_gpu.cu.cc", - "caffe2/sgd/momentum_sgd_op_gpu.cu.cc", - "caffe2/sgd/rmsprop_op_gpu.cu.cc", - "caffe2/sgd/yellowfin_op_gpu.cu.cc", - "caffe2/utils/math/broadcast.cu.cc", - "caffe2/utils/math/elementwise.cu.cc", - "caffe2/utils/math/reduce.cu.cc", - "caffe2/utils/math/transpose.cu.cc", - "caffe2/utils/math_gpu.cu.cc", + "caffe2/core/context_gpu.cu", + "caffe2/operators/abs_op.cu", + "caffe2/operators/accumulate_op.cu", + "caffe2/operators/accuracy_op.cu", + "caffe2/operators/acos_op.cu", + 
"caffe2/operators/affine_channel_op.cu", + "caffe2/operators/alias_with_name.cu", + "caffe2/operators/arg_ops.cu", + "caffe2/operators/asin_op.cu", + "caffe2/operators/assert_op.cu", + "caffe2/operators/atan_op.cu", + "caffe2/operators/batch_gather_ops.cu", + "caffe2/operators/batch_matmul_op.cu", + "caffe2/operators/batch_moments_op.cu", + "caffe2/operators/batch_permutation_op.cu", + "caffe2/operators/batch_sparse_to_dense_op.cu", + "caffe2/operators/boolean_mask_ops.cu", + "caffe2/operators/boolean_unmask_ops.cu", + "caffe2/operators/bucketize_op.cu", + "caffe2/operators/cast_op.cu", + "caffe2/operators/cbrt_op.cu", + "caffe2/operators/ceil_op.cu", + "caffe2/operators/channel_backprop_stats_op.cu", + "caffe2/operators/channel_shuffle_op.cu", + "caffe2/operators/channel_stats_op.cu", + "caffe2/operators/channelwise_conv3d_op_cudnn.cu", + "caffe2/operators/clip_op.cu", + "caffe2/operators/copy_op.cu", + "caffe2/operators/cos_op.cu", + "caffe2/operators/cosh_op.cu", + "caffe2/operators/cosine_embedding_criterion_op.cu", + "caffe2/operators/cross_entropy_op.cu", + "caffe2/operators/cube_op.cu", + "caffe2/operators/data_couple_gpu.cu", + "caffe2/operators/deform_conv_op.cu", + "caffe2/operators/depthwise_3x3_conv_op_cudnn.cu", + "caffe2/operators/distance_op.cu", + "caffe2/operators/dropout_op.cu", + "caffe2/operators/elementwise_div_op.cu", + "caffe2/operators/elementwise_linear_op.cu", + "caffe2/operators/elementwise_mul_op.cu", + "caffe2/operators/elementwise_ops.cu", + "caffe2/operators/elu_op.cu", + "caffe2/operators/enforce_finite_op.cu", + "caffe2/operators/ensure_cpu_output_op.cu", + "caffe2/operators/erf_op.cu", + "caffe2/operators/filler_op.cu", + "caffe2/operators/find_op.cu", + "caffe2/operators/floor_op.cu", + "caffe2/operators/gather_op.cu", + "caffe2/operators/gelu_op.cu", + "caffe2/operators/generate_proposals_op.cu", + "caffe2/operators/generate_proposals_op_util_nms_gpu.cu", + "caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.cu", + 
"caffe2/operators/given_tensor_fill_op.cu", + "caffe2/operators/glu_op.cu", + "caffe2/operators/group_norm_op.cu", + "caffe2/operators/gru_unit_op_gpu.cu", + "caffe2/operators/half_float_ops.cu", + "caffe2/operators/hard_sigmoid_op.cu", + "caffe2/operators/instance_norm_op.cu", + "caffe2/operators/integral_image_op.cu", + "caffe2/operators/layer_norm_op.cu", + "caffe2/operators/leaky_relu_op.cu", + "caffe2/operators/lengths_pad_op.cu", + "caffe2/operators/lengths_tile_op.cu", + "caffe2/operators/local_response_normalization_op.cu", + "caffe2/operators/logit_op.cu", + "caffe2/operators/loss_op.cu", + "caffe2/operators/lp_pool_op.cu", + "caffe2/operators/lstm_unit_op_gpu.cu", + "caffe2/operators/margin_ranking_criterion_op.cu", + "caffe2/operators/max_pool_with_index.cu", + "caffe2/operators/mean_op.cu", + "caffe2/operators/mem_query_op.cu", + "caffe2/operators/minmax_ops.cu", + "caffe2/operators/moments_op.cu", + "caffe2/operators/multi_class_accuracy_op.cu", + "caffe2/operators/normalize_ops.cu", + "caffe2/operators/one_hot_ops.cu", + "caffe2/operators/pack_segments.cu", + "caffe2/operators/pad_op_gpu.cu", + "caffe2/operators/perplexity_op.cu", + "caffe2/operators/piecewise_linear_transform_op.cu", + "caffe2/operators/pool_op.cu", + "caffe2/operators/pow_op.cu", + "caffe2/operators/prelu_op.cu", + "caffe2/operators/reciprocal_op.cu", + "caffe2/operators/reduce_front_back_max_ops.cu", + "caffe2/operators/reduce_front_back_sum_mean_ops.cu", + "caffe2/operators/reduce_ops.cu", + "caffe2/operators/reduction_ops.cu", + "caffe2/operators/relu_n_op.cu", + "caffe2/operators/relu_op.cu", + "caffe2/operators/replace_nan_op.cu", + "caffe2/operators/resize_3d_op.cu", + "caffe2/operators/resize_op.cu", + "caffe2/operators/reverse_packed_segs_op.cu", + "caffe2/operators/rmac_regions_op.cu", + "caffe2/operators/rnn/recurrent_network_op_gpu.cu", + "caffe2/operators/roi_align_gradient_op.cu", + "caffe2/operators/roi_align_op.cu", + 
"caffe2/operators/roi_align_rotated_gradient_op.cu", + "caffe2/operators/roi_align_rotated_op.cu", + "caffe2/operators/roi_pool_op.cu", + "caffe2/operators/rsqrt_op.cu", + "caffe2/operators/scale_blobs_op.cu", + "caffe2/operators/segment_reduction_op_gpu.cu", + "caffe2/operators/selu_op.cu", + "caffe2/operators/sequence_ops.cu", + "caffe2/operators/sigmoid_op.cu", + "caffe2/operators/sin_op.cu", + "caffe2/operators/sinh_op.cu", + "caffe2/operators/slice_op.cu", + "caffe2/operators/softmax_ops.cu", + "caffe2/operators/softplus_op.cu", + "caffe2/operators/softsign_op.cu", + "caffe2/operators/space_batch_op_gpu.cu", + "caffe2/operators/sparse_normalize_op_gpu.cu", + "caffe2/operators/sparse_to_dense_op.cu", + "caffe2/operators/spatial_batch_norm_op.cu", + "caffe2/operators/spatial_batch_norm_op_cudnn.cu", + "caffe2/operators/stump_func_op.cu", + "caffe2/operators/summarize_op.cu", + "caffe2/operators/swish_op.cu", + "caffe2/operators/tan_op.cu", + "caffe2/operators/tanh_op.cu", + "caffe2/operators/thresholded_relu_op.cu", + "caffe2/operators/tile_op.cu", + "caffe2/operators/top_k.cu", + "caffe2/operators/transpose_op.cu", + "caffe2/operators/unique_ops.cu", + "caffe2/operators/upsample_op.cu", + "caffe2/operators/utility_ops.cu", + "caffe2/operators/weighted_sample_op.cu", + "caffe2/sgd/adadelta_op_gpu.cu", + "caffe2/sgd/adagrad_op_gpu.cu", + "caffe2/sgd/adam_op_gpu.cu", + "caffe2/sgd/fp16_momentum_sgd_op.cu", + "caffe2/sgd/fp32_momentum_sgd_op.cu", + "caffe2/sgd/lars_op_gpu.cu", + "caffe2/sgd/momentum_sgd_op_gpu.cu", + "caffe2/sgd/rmsprop_op_gpu.cu", + "caffe2/sgd/yellowfin_op_gpu.cu", + "caffe2/utils/math/broadcast.cu", + "caffe2/utils/math/elementwise.cu", + "caffe2/utils/math/reduce.cu", + "caffe2/utils/math/transpose.cu", + "caffe2/utils/math_gpu.cu", ], ) @@ -1432,6 +1363,29 @@ cc_library( ], ) +py_binary( + name = "gen_op", + srcs = ["caffe2/contrib/aten/gen_op.py"], + deps = [":tools_codegen"], +) + +genrule( + name = "generated_caffe2_aten_op_headers", + srcs 
= [ + "caffe2/contrib/aten/aten_op_template.h", + "aten/src/ATen/Declarations.yaml", + ], + outs = ["caffe2/caffe2/contrib/aten/gen_aten_op.h"], + cmd = """ + $(location :gen_op) \ + --output_prefix gen_ \ + --install_dir $(@D) \ + --aten_root `dirname $(location aten/src/ATen/Declarations.yaml)`/../.. \ + --template_dir `dirname $(location caffe2/contrib/aten/aten_op_template.h)` \ + --yaml_dir `dirname $(location aten/src/ATen/Declarations.yaml)`""", + tools = [":gen_op"], +) + cc_library( name = "caffe2_headers", hdrs = glob([ @@ -1472,7 +1426,7 @@ cc_library( ]) + if_cuda(glob([ "caffe2/**/*.cuh", "caffe2/image/*.h", - ])), + ])) + [":generated_caffe2_aten_op_headers"], copts = CAFFE2_COPTS, includes = [ "caffe2/contrib/aten", @@ -1554,7 +1508,7 @@ cc_library( "@fmt", ] + if_cuda( [ - ":caffe2_cpp_cuda", + ":caffe2_cuda_cpp", ":aten_cuda", "@tensorpipe//:tensorpipe_cuda", ], @@ -1567,8 +1521,8 @@ cc_library( ) cc_library( - name = "caffe2_cpp_cuda", - srcs = [":caffe2_cuda_srcs"], + name = "caffe2_cuda_cpp", + srcs = [":caffe2_cuda_cpp_srcs"], copts = CAFFE2_COPTS, visibility = ["//visibility:public"], deps = [ @@ -1586,7 +1540,6 @@ cu_library( deps = [ ":aten", ":caffe2_headers", - "@cub", "@cuda//:cublas", "@cuda//:curand", "@cudnn", @@ -1610,6 +1563,7 @@ PERF_COPTS = [ "-DHAVE_SHM_OPEN=1", "-DHAVE_SHM_UNLINK=1", "-DSLEEF_STATIC_LIBS=1", + "-DTH_BLAS_MKL", "-D_FILE_OFFSET_BITS=64", "-DUSE_FBGEMM", "-fvisibility-inlines-hidden", @@ -1693,10 +1647,29 @@ genrule( srcs = ["torch/csrc/api/include/torch/version.h.in", "version.txt"], outs = ["torch/csrc/api/include/torch/version.h"], cmd = "$(location :gen_version_header) --template-path $(location torch/csrc/api/include/torch/version.h.in) --version-path $(location version.txt) --output-path $@", - tools = [':gen_version_header'] + tools = [':gen_version_header'], ) -torch_cuda_headers = glob(["torch/csrc/cuda/*.h"]) +py_binary( + name = "stringify_file", + srcs = 
["torch/csrc/jit/codegen/cuda/tools/stringify_file.py"], +) + +generated_nvfuser_hdrs = ["generated_" + hdr for hdr in libtorch_nvfuser_generated_headers] + +[ + genrule( + name = name, + srcs = [src], + outs = ["nvfuser_resources/{}".format(hdr)], + cmd = "$(location :stringify_file) -i $< -o $@", + tools = [":stringify_file"], + ) + for name, src, hdr in zip(generated_nvfuser_hdrs, libtorch_nvfuser_runtime_sources, libtorch_nvfuser_generated_headers) +] + +torch_cuda_headers = glob(["torch/csrc/cuda/*.h"]) + generated_nvfuser_hdrs + cc_library( name = "torch_headers", hdrs = if_cuda( @@ -1707,6 +1680,7 @@ cc_library( "torch/csrc/**/*.h", "torch/csrc/distributed/c10d/*.hpp", "torch/lib/libshm/*.h", + "torch/csrc/generic/*.cpp", ], exclude = [ "torch/csrc/autograd/generated/VariableType.h", @@ -1743,21 +1717,25 @@ TORCH_COPTS = COMMON_COPTS + [ "-fno-trapping-math", ] +cu_library( + name = "torch_distributed_cuda", + srcs = ["torch/csrc/distributed/c10d/quantization/quantization_gpu.cu"], + deps = [":torch_headers"], +) + cc_library( name = "torch", srcs = if_cuda(glob( - [ - "torch/csrc/cuda/*.cpp", - "torch/csrc/autograd/functions/comm.cpp", - ], + libtorch_cuda_sources, exclude = [ "torch/csrc/cuda/python_nccl.cpp", "torch/csrc/cuda/nccl.cpp", + "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", ], )) + libtorch_core_sources + libtorch_distributed_sources + torch_cpp_srcs + libtorch_extra_sources + jit_core_sources + [ ":cpp_generated_code", ], - copts = TORCH_COPTS + if_cuda(["-DUSE_CUDA=1"]), + copts = TORCH_COPTS, defines = [ "CAFFE2_NIGHTLY_VERSION=20200115", ], @@ -1765,7 +1743,10 @@ cc_library( deps = [ ":caffe2", ":torch_headers", - ], + ] + if_cuda([ + ":torch_distributed_cuda", + "@cuda//:nvToolsExt", + ]), alwayslink = True, ) @@ -1783,10 +1764,9 @@ cc_library( "**/*.h", "**/*.cuh", ]) + [ - ":generated_code", + ":cpp_generated_code", ], includes = [ - ".", "torch/csrc/api/include", "torch/csrc/distributed", "torch/lib", @@ -1794,21 
+1774,17 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ - ":aten_headers", - ":caffe2_headers", - "//c10:headers", + ":torch_headers", ], ) cc_library( name = "torch_python", srcs = libtorch_python_core_sources + [":python_generated_code"], - hdrs = glob([ - "torch/csrc/generic/*.cpp", - ]), deps = [ ":torch", ":shm", + "@pybind11", ], ) @@ -1842,11 +1818,16 @@ cc_library( # Torch integration tests rely on a labeled data set from the MNIST database. # http://yann.lecun.com/exdb/mnist/ -cpp_api_tests = glob(["test/cpp/api/*.cpp"]) +# imethod.cpp is excluded since torch/csrc/deploy* build is not yet supported. +cpp_api_tests = glob( + ["test/cpp/api/*.cpp"], + exclude = ["test/cpp/api/imethod.cpp"], +) + [ cc_test( name = paths.split_extension(paths.basename(filename))[0].replace("-","_") + "_test", - size = "medium", + size = "medium", srcs = [filename], deps = [ ":test_support", diff --git a/WORKSPACE b/WORKSPACE index 9396a3451c36..0497bef41039 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,7 +1,22 @@ workspace(name = "pytorch") load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") -load("//tools/rules:workspace.bzl", "new_patched_local_repository", "new_empty_repository") +load("//tools/rules:workspace.bzl", "new_patched_local_repository") + +http_archive( + name = "rules_cuda", + sha256 = "f80438bee9906e9ecb1a8a4ae2365374ac1e8a283897281a2db2fb7fcf746333", + strip_prefix = "runtime-b1c7cce21ba4661c17ac72421c6a0e2015e7bef3/third_party/rules_cuda", + urls = ["https://github.com/tensorflow/runtime/archive/b1c7cce21ba4661c17ac72421c6a0e2015e7bef3.tar.gz"], +) + +load("@rules_cuda//cuda:dependencies.bzl", "rules_cuda_dependencies") + +rules_cuda_dependencies() + +load("@rules_cc//cc:repositories.bzl", "rules_cc_toolchains") + +rules_cc_toolchains() http_archive( name = "bazel_skylib", @@ -171,13 +186,14 @@ load("@rules_python//python:repositories.bzl", "py_repositories") py_repositories() -local_repository( - name = "local_config_cuda", 
- path = "third_party/tensorflow_cuda_bazel_build", +new_local_repository( + name = "cuda", + build_file = "@//third_party:cuda.BUILD", + path = "/usr/local/cuda", ) -# Wrapper to expose local_config_cuda in an agnostic way -new_empty_repository( - name = "cuda", - build_file = "//third_party:cuda.BUILD", +new_local_repository( + name = "cudnn", + build_file = "@//third_party:cudnn.BUILD", + path = "/usr/", ) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index aa47ae43b318..f82aacee2381 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -254,6 +254,7 @@ _(aten, conv_tbc) \ _(aten, conv_tbc_backward) \ _(aten, conv_transpose1d) \ _(aten, convolution) \ +_(aten, convolution_backward) \ _(aten, copy_sparse_to_sparse) \ _(aten, corrcoef) \ _(aten, cos) \ @@ -474,18 +475,8 @@ _(aten, min_values) \ _(aten, miopen_batch_norm) \ _(aten, miopen_batch_norm_backward) \ _(aten, miopen_convolution) \ -_(aten, miopen_convolution_backward) \ -_(aten, miopen_convolution_backward_bias) \ -_(aten, miopen_convolution_backward_input) \ -_(aten, miopen_convolution_backward_weight) \ _(aten, miopen_convolution_transpose) \ -_(aten, miopen_convolution_transpose_backward) \ -_(aten, miopen_convolution_transpose_backward_input) \ -_(aten, miopen_convolution_transpose_backward_weight) \ _(aten, miopen_depthwise_convolution) \ -_(aten, miopen_depthwise_convolution_backward) \ -_(aten, miopen_depthwise_convolution_backward_input) \ -_(aten, miopen_depthwise_convolution_backward_weight) \ _(aten, miopen_rnn) \ _(aten, miopen_rnn_backward) \ _(aten, mish) \ @@ -683,6 +674,7 @@ _(aten, take_along_dim) \ _(aten, tan) \ _(aten, tanh) \ _(aten, tanh_) \ +_(aten, tanh_backward) \ _(aten, tensor) \ _(aten, tensordot) \ _(aten, tensor_split) \ diff --git a/aten/src/ATen/mkl/SparseBlas.cpp b/aten/src/ATen/mkl/SparseBlas.cpp index cc71ea6fbfef..67dcb30e5283 100644 --- 
a/aten/src/ATen/mkl/SparseBlas.cpp +++ b/aten/src/ATen/mkl/SparseBlas.cpp @@ -11,16 +11,16 @@ namespace sparse { namespace { - template - MKL_Complex to_mkl_complex(c10::complex scalar) { - MKL_Complex mkl_scalar; - mkl_scalar.real = scalar.real(); - mkl_scalar.imag = scalar.imag(); - return mkl_scalar; - } - +template +MKL_Complex to_mkl_complex(c10::complex scalar) { + MKL_Complex mkl_scalar; + mkl_scalar.real = scalar.real(); + mkl_scalar.imag = scalar.imag(); + return mkl_scalar; } +} // namespace + // There are link errors when compiling with create_csr functions on Windows. // See https://github.com/pytorch/pytorch/pull/50937#issuecomment-779272492 #if !defined(_WIN32) @@ -60,6 +60,65 @@ void create_csr>( col_indx, reinterpret_cast(values))); } + +template <> +void create_bsr(MKL_SPARSE_CREATE_BSR_ARGTYPES(float)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_s_create_bsr( + A, + indexing, + block_layout, + rows, + cols, + block_size, + rows_start, + rows_end, + col_indx, + values)); +} +template <> +void create_bsr(MKL_SPARSE_CREATE_BSR_ARGTYPES(double)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_d_create_bsr( + A, + indexing, + block_layout, + rows, + cols, + block_size, + rows_start, + rows_end, + col_indx, + values)); +} +template <> +void create_bsr>( + MKL_SPARSE_CREATE_BSR_ARGTYPES(c10::complex)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_c_create_bsr( + A, + indexing, + block_layout, + rows, + cols, + block_size, + rows_start, + rows_end, + col_indx, + reinterpret_cast(values))); +} +template <> +void create_bsr>( + MKL_SPARSE_CREATE_BSR_ARGTYPES(c10::complex)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_z_create_bsr( + A, + indexing, + block_layout, + rows, + cols, + block_size, + rows_start, + rows_end, + col_indx, + reinterpret_cast(values))); +} #endif // !defined(_WIN32) template <> diff --git a/aten/src/ATen/mkl/SparseBlas.h b/aten/src/ATen/mkl/SparseBlas.h index 140803b30f98..7281b6950611 100644 --- a/aten/src/ATen/mkl/SparseBlas.h +++ b/aten/src/ATen/mkl/SparseBlas.h @@ 
-42,6 +42,31 @@ template <> void create_csr>( MKL_SPARSE_CREATE_CSR_ARGTYPES(c10::complex)); +#define MKL_SPARSE_CREATE_BSR_ARGTYPES(scalar_t) \ + sparse_matrix_t *A, const sparse_index_base_t indexing, \ + const sparse_layout_t block_layout, const MKL_INT rows, \ + const MKL_INT cols, MKL_INT block_size, MKL_INT *rows_start, \ + MKL_INT *rows_end, MKL_INT *col_indx, scalar_t *values + +template +inline void create_bsr(MKL_SPARSE_CREATE_BSR_ARGTYPES(scalar_t)) { + TORCH_INTERNAL_ASSERT( + false, + "at::mkl::sparse::create_bsr: not implemented for ", + typeid(scalar_t).name()); +} + +template <> +void create_bsr(MKL_SPARSE_CREATE_BSR_ARGTYPES(float)); +template <> +void create_bsr(MKL_SPARSE_CREATE_BSR_ARGTYPES(double)); +template <> +void create_bsr>( + MKL_SPARSE_CREATE_BSR_ARGTYPES(c10::complex)); +template <> +void create_bsr>( + MKL_SPARSE_CREATE_BSR_ARGTYPES(c10::complex)); + #define MKL_SPARSE_MV_ARGTYPES(scalar_t) \ const sparse_operation_t operation, const scalar_t alpha, \ const sparse_matrix_t A, const struct matrix_descr descr, \ diff --git a/aten/src/ATen/mkl/SparseDescriptors.h b/aten/src/ATen/mkl/SparseDescriptors.h index 2f4f8731adf3..46d656898a8d 100644 --- a/aten/src/ATen/mkl/SparseDescriptors.h +++ b/aten/src/ATen/mkl/SparseDescriptors.h @@ -92,21 +92,42 @@ class MklSparseCsrDescriptor crow_indices_ = prepare_indices_for_mkl(crow_indices); col_indices_ = prepare_indices_for_mkl(col_indices); + values_ = values.expect_contiguous(); - auto values_ptr = values.data_ptr(); + auto values_ptr = values_->data_ptr(); auto crow_indices_ptr = crow_indices_->data_ptr(); auto col_indices_ptr = col_indices_->data_ptr(); sparse_matrix_t raw_descriptor; - create_csr( - &raw_descriptor, - SPARSE_INDEX_BASE_ZERO, - rows, - cols, - crow_indices_ptr, - crow_indices_ptr + 1, - col_indices_ptr, - values_ptr); + + // Assuming that the last two dimensions are block elements of the matrix + if (values.dim() == 3) { + TORCH_CHECK( + values.size(-1) == values.size(-2), + 
"MKL Sparse doesn't support matrices with non-square blocks."); + auto block_size = mkl_int_cast(values.size(-1), "block_size"); + create_bsr( + &raw_descriptor, + SPARSE_INDEX_BASE_ZERO, + SPARSE_LAYOUT_ROW_MAJOR, + rows / block_size, + cols / block_size, + block_size, + crow_indices_ptr, + crow_indices_ptr + 1, + col_indices_ptr, + values_ptr); + } else { + create_csr( + &raw_descriptor, + SPARSE_INDEX_BASE_ZERO, + rows, + cols, + crow_indices_ptr, + crow_indices_ptr + 1, + col_indices_ptr, + values_ptr); + } descriptor_.reset(raw_descriptor); } @@ -119,6 +140,7 @@ class MklSparseCsrDescriptor private: c10::MaybeOwned crow_indices_; c10::MaybeOwned col_indices_; + c10::MaybeOwned values_; }; } // namespace sparse diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 8e7b32a610ef..4a50ed392e70 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -9,11 +9,23 @@ namespace at { namespace native { using cudnn_convolution_backward_fn = std::tuple(*)( const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array); +DECLARE_DISPATCH(cudnn_convolution_backward_fn, cudnn_convolution_backward_stub); using cudnn_convolution_transpose_backward_fn = std::tuple(*)( const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array); -DECLARE_DISPATCH(cudnn_convolution_backward_fn, cudnn_convolution_backward_stub); DECLARE_DISPATCH(cudnn_convolution_transpose_backward_fn, cudnn_convolution_transpose_backward_stub); +using miopen_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, bool, bool, std::array); +DECLARE_DISPATCH(miopen_convolution_backward_fn, miopen_convolution_backward_stub); +using 
miopen_convolution_transpose_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, std::array); +DECLARE_DISPATCH(miopen_convolution_transpose_backward_fn, miopen_convolution_transpose_backward_stub); +using miopen_depthwise_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, bool, bool, std::array); +DECLARE_DISPATCH(miopen_depthwise_convolution_backward_fn, miopen_depthwise_convolution_backward_stub); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) struct ConvParams { diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index d3426b5984d2..ab4413893a73 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -25,8 +25,14 @@ namespace at { namespace native { DEFINE_DISPATCH(cudnn_convolution_backward_stub); DEFINE_DISPATCH(cudnn_convolution_transpose_backward_stub); DEFINE_DISPATCH(convolution_depthwise3x3_winograd_stub); +DEFINE_DISPATCH(miopen_convolution_backward_stub); +DEFINE_DISPATCH(miopen_convolution_transpose_backward_stub); +DEFINE_DISPATCH(miopen_depthwise_convolution_backward_stub); REGISTER_NO_CPU_DISPATCH(cudnn_convolution_backward_stub, cudnn_convolution_backward_fn); REGISTER_NO_CPU_DISPATCH(cudnn_convolution_transpose_backward_stub, cudnn_convolution_transpose_backward_fn); +REGISTER_NO_CPU_DISPATCH(miopen_convolution_backward_stub, miopen_convolution_backward_fn); +REGISTER_NO_CPU_DISPATCH(miopen_convolution_transpose_backward_stub, miopen_convolution_transpose_backward_fn); +REGISTER_NO_CPU_DISPATCH(miopen_depthwise_convolution_backward_stub, miopen_depthwise_convolution_backward_fn); std::ostream& operator<<(std::ostream & out, const ConvParams& params) { out << "ConvParams {" @@ -1614,20 +1620,23 @@ std::tuple 
convolution_backward( case ConvBackend::Miopen: check_input_same_type_as_parameters(input, weight); std::tie(backend_grad_input, backend_grad_weight, backend_grad_bias) = - at::miopen_convolution_backward( + miopen_convolution_backward_stub( + input.device().type(), input.contiguous(backend_memory_format), grad_output, weight, params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic, output_mask); break; case ConvBackend::MiopenDepthwise: std::tie(backend_grad_input, backend_grad_weight, backend_grad_bias) = - at::miopen_depthwise_convolution_backward( + miopen_depthwise_convolution_backward_stub( + input.device().type(), input.contiguous(backend_memory_format), grad_output, weight, params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic, output_mask); break; case ConvBackend::MiopenTranspose: check_input_same_type_as_parameters(input, weight); std::tie(backend_grad_input, backend_grad_weight, backend_grad_bias) = - at::miopen_convolution_transpose_backward( + miopen_convolution_transpose_backward_stub( + input.device().type(), input.contiguous(backend_memory_format), grad_output, weight, params.padding, params.output_padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic, output_mask); break; diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index 66c6232ef2f9..5544f3661de7 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -156,7 +156,7 @@ Tensor _cdist_forward(const Tensor& x1, const Tensor& x2, const double p, c10::o return result; } -Tensor _cdist_backward(const Tensor& grad, const Tensor& _x1, const Tensor& _x2, const double p, const Tensor& cdist) { +Tensor _cdist_backward(const Tensor& _grad, const Tensor& _x1, const Tensor& _x2, const double p, const Tensor& _cdist) { // Broadcasting might generate non-contiguous Tensors, so handle it before doing checks 
int64_t c1 = _x1.size(-1); int64_t c2 = _x2.size(-1); @@ -182,17 +182,17 @@ Tensor _cdist_backward(const Tensor& grad, const Tensor& _x1, const Tensor& _x2, Tensor x1 = _x1; if (tensor1_expand_size != x1.sizes()) { - x1 = x1.expand(tensor1_expand_size).contiguous(); + x1 = x1.expand(tensor1_expand_size); } Tensor x2 = _x2; if (tensor2_expand_size != x2.sizes()) { - x2 = x2.expand(tensor2_expand_size).contiguous(); + x2 = x2.expand(tensor2_expand_size); } - TORCH_CHECK(x1.is_contiguous(), "_cdist_backward requires X1 to be contiguous"); - TORCH_CHECK(x2.is_contiguous(), "_cdist_backward requires X2 to be contiguous"); - TORCH_CHECK(cdist.is_contiguous(), "_cdist_backward requires dist to be contiguous"); - TORCH_CHECK(grad.is_contiguous(), "_cdist_backward requires grad to be contiguous"); + x1 = x1.contiguous(); + x2 = x2.contiguous(); + auto cdist = _cdist.contiguous(); + auto grad = _grad.contiguous(); int64_t n = x1.size(-2); int64_t m = x1.size(-1); auto device1 = x1.device().type(); diff --git a/aten/src/ATen/native/Itertools.cpp b/aten/src/ATen/native/Itertools.cpp index 1e4fc7b746ea..d1117b8c1d4d 100644 --- a/aten/src/ATen/native/Itertools.cpp +++ b/aten/src/ATen/native/Itertools.cpp @@ -37,7 +37,7 @@ Tensor cartesian_prod(TensorList tensors) { if (tensors.size() == 1) { return tensors[0]; } - std::vector grids = at::meshgrid(tensors); + std::vector grids = at::meshgrid(tensors, "ij"); for(Tensor &t : grids) { t = t.flatten(); } diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 34b45b2f793a..7eda0fe0988c 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -1,6 +1,7 @@ #include #include #include +#include // TODO: Remove the condition on AT_ROCM_ENABLED entirely, // don't build this file as part of CPU build. 
@@ -760,6 +761,228 @@ Tensor miopen_depthwise_convolution( return output_t; } +// --------------------------------------------------------------------- +// +// Convolution backward (bias) +// +// --------------------------------------------------------------------- + +Tensor miopen_convolution_backward_bias( + const Tensor& grad_output_t) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }; + + // TODO: Workaround since MIOpen does not support NHWC bias + // See #64426 + std::vector discard_dims; + for( int i = 0; i < grad_output_t.dim(); i++ ) { + if(i != output_channels_dim ) { + discard_dims.push_back(i); + } + } + + Tensor outputBias = at::squeeze( at::sum(grad_output_t, discard_dims, true) ); + if( outputBias.dim() == 0 ) { + // always return a tensor of shape [_] + return outputBias.unsqueeze(0); + } + else { + return outputBias; + } + +/* MIOpen does not support NHWC bias. Activate once support is added. + auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options()); + + TensorArg grad_bias{ grad_bias_t, "result", 0 }; + + TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), + static_cast(grad_output->dim())}; + TensorDescriptor odesc{*grad_output}; + + auto handle = getMiopenHandle(); + auto dataType = getMiopenDataType(*grad_bias); + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), + &zero, bdesc.desc(), grad_bias->data_ptr())); + return *grad_bias; +*/ +} + +// --------------------------------------------------------------------- +// +// Convolution backward (weight) +// +// --------------------------------------------------------------------- + +void raw_miopen_convolution_backward_weight_out( + const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic) { 
+ + auto dataType = getMiopenDataType(input); + miopenConvolutionMode_t c_mode = miopenConvolution; + + ConvolutionArgs args{ input, grad_output, grad_weight }; + args.handle = getMiopenHandle(); + setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(input); + args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); + args.odesc.set(grad_output); + args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardWeights( + args.handle, + &one, args.odesc.desc(), grad_output.data_ptr(), + args.idesc.desc(), input.data_ptr(), + args.cdesc.desc(), bwdFilterAlg, &zero, + args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); +} + +//Depthwise backward weights. 
+void raw_miopen_depthwise_convolution_backward_weight_out( + const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic) { + + auto dataType = getMiopenDataType(input); + miopenConvolutionMode_t c_mode = miopenDepthwise; + + ConvolutionArgs args{ input, grad_output, grad_weight }; + args.handle = getMiopenHandle(); + setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(input); + args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); + args.odesc.set(grad_output); + args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardWeights( + args.handle, + &one, args.odesc.desc(), grad_output.data_ptr(), + args.idesc.desc(), input.data_ptr(), + args.cdesc.desc(), bwdFilterAlg, &zero, + args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); +} + +Tensor miopen_depthwise_convolution_backward_weight( + CheckedFrom c, + IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + + checkAllSameType(c, {grad_output, input}); + checkAllSameGPU(c, {grad_output, input}); + + auto memory_format = at::MemoryFormat::Contiguous; + if (miopen_conv_use_channels_last(*input, *grad_output)) { + memory_format = (input->ndimension() == 5) ? 
/*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + } + + Tensor grad_output_contig_t = grad_output->contiguous(memory_format); + // Make sure that NC11 strides follow formula + grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); + TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; + + Tensor input_contig_t = input->contiguous(memory_format); + input_contig_t.resize_(input_contig_t.sizes(), memory_format); + TensorArg input_contig{ input_contig_t, "input", 2}; + + auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); + + // For uniformity with everything else, although it seems grad_weight + // would be unambiguous too. + TensorArg grad_weight{ grad_weight_t, "result", 0 }; + convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); + + raw_miopen_depthwise_convolution_backward_weight_out( + *grad_weight, *grad_output_contig, *input_contig, + padding, stride, dilation, groups, benchmark, deterministic); + + return grad_weight_t; +} + +Tensor miopen_depthwise_convolution_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + input{ input_t, "input", 2 }; + return miopen_depthwise_convolution_backward_weight( + "miopen_depthwise_convolution_backward_weight", + weight_size, grad_output, input, + padding, stride, dilation, groups, benchmark, deterministic); +} + +Tensor miopen_convolution_backward_weight( + CheckedFrom c, + IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + + checkAllSameType(c, {grad_output, input}); + 
checkAllSameGPU(c, {grad_output, input}); + + auto memory_format = at::MemoryFormat::Contiguous; + if (miopen_conv_use_channels_last(*input, *grad_output)) { + memory_format = (input->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; + } + + Tensor grad_output_contig_t = grad_output->contiguous(memory_format); + // Make sure that NC11 strides follow formula + grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); + TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; + + Tensor input_contig_t = input->contiguous(memory_format); + input_contig_t.resize_(input_contig_t.sizes(), memory_format); + TensorArg input_contig{ input_contig_t, "input", 2}; + + auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); + + // For uniformity with everything else, although it seems grad_weight + // would be unambiguous too. + TensorArg grad_weight{ grad_weight_t, "result", 0 }; + convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); + + raw_miopen_convolution_backward_weight_out( + *grad_weight, *grad_output_contig, *input_contig, + padding, stride, dilation, groups, benchmark, deterministic); + + return grad_weight_t; +} + +Tensor miopen_convolution_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + input{ input_t, "input", 2 }; + return miopen_convolution_backward_weight( + "miopen_convolution_backward_weight", + weight_size, grad_output, input, + padding, stride, dilation, groups, benchmark, deterministic); +} + Tensor miopen_convolution_transpose_backward_input( const Tensor& grad_output_t, const Tensor& weight_t, IntArrayRef padding, IntArrayRef stride, 
IntArrayRef dilation, @@ -772,6 +995,21 @@ Tensor miopen_convolution_transpose_backward_input( grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); } +Tensor miopen_convolution_transpose_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + input{ input_t, "input", 2 }; + return miopen_convolution_backward_weight( + "miopen_convolution_backward_weight", + weight_size, input, grad_output, + padding, stride, dilation, groups, benchmark, deterministic); +} + std::tuple miopen_convolution_transpose_backward( const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, @@ -781,13 +1019,13 @@ std::tuple miopen_convolution_transpose_backwa Tensor grad_input, grad_weight, grad_bias; if (output_mask[0]) { - grad_input = at::miopen_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); + grad_input = miopen_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); } if (output_mask[1]) { - grad_weight = at::miopen_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); + grad_weight = miopen_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); } if (output_mask[2]) { - grad_bias = at::miopen_convolution_backward_bias(grad_output); + grad_bias = miopen_convolution_backward_bias(grad_output); } return std::tuple{grad_input, grad_weight, grad_bias}; @@ -994,13 +1232,13 @@ std::tuple miopen_convolution_backward( 
Tensor grad_input, grad_weight, grad_bias; if (output_mask[0]) { - grad_input = at::miopen_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); + grad_input = miopen_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); } if (output_mask[1]) { - grad_weight = at::miopen_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); + grad_weight = miopen_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); } if (output_mask[2]) { - grad_bias = at::miopen_convolution_backward_bias(grad_output); + grad_bias = miopen_convolution_backward_bias(grad_output); } return std::tuple{grad_input, grad_weight, grad_bias}; @@ -1015,13 +1253,13 @@ std::tuple miopen_depthwise_convolution_backwa Tensor grad_input, grad_weight, grad_bias; if (output_mask[0]) { - grad_input = at::miopen_depthwise_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); + grad_input = miopen_depthwise_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); } if (output_mask[1]) { - grad_weight = at::miopen_depthwise_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); + grad_weight = miopen_depthwise_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); } if (output_mask[2]) { - grad_bias = at::miopen_convolution_backward_bias(grad_output); + grad_bias = miopen_convolution_backward_bias(grad_output); } return std::tuple{grad_input, grad_weight, grad_bias}; @@ -1048,243 +1286,9 @@ Tensor miopen_convolution_transpose( return output_t; } -// 
--------------------------------------------------------------------- -// -// Convolution backward (weight) -// -// --------------------------------------------------------------------- - -void raw_miopen_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenConvolution; - - ConvolutionArgs args{ input, grad_output, grad_weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); - - miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardWeights( - args.handle, - &one, args.odesc.desc(), grad_output.data_ptr(), - args.idesc.desc(), input.data_ptr(), - args.cdesc.desc(), bwdFilterAlg, &zero, - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); -} - -Tensor miopen_convolution_backward_weight( - CheckedFrom c, - IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - - checkAllSameType(c, {grad_output, input}); - checkAllSameGPU(c, {grad_output, input}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? 
/*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; - } - - Tensor grad_output_contig_t = grad_output->contiguous(memory_format); - // Make sure that NC11 strides follow formula - grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); - TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; - - Tensor input_contig_t = input->contiguous(memory_format); - input_contig_t.resize_(input_contig_t.sizes(), memory_format); - TensorArg input_contig{ input_contig_t, "input", 2}; - - auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); - - // For uniformity with everything else, although it seems grad_weight - // would be unambiguous too. - TensorArg grad_weight{ grad_weight_t, "result", 0 }; - convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); - - raw_miopen_convolution_backward_weight_out( - *grad_weight, *grad_output_contig, *input_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return grad_weight_t; -} - -//Depthwise backward weights. 
-void raw_miopen_depthwise_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenDepthwise; - - ConvolutionArgs args{ input, grad_output, grad_weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); - - miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardWeights( - args.handle, - &one, args.odesc.desc(), grad_output.data_ptr(), - args.idesc.desc(), input.data_ptr(), - args.cdesc.desc(), bwdFilterAlg, &zero, - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); -} - -Tensor miopen_depthwise_convolution_backward_weight( - CheckedFrom c, - IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - - checkAllSameType(c, {grad_output, input}); - checkAllSameGPU(c, {grad_output, input}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? 
/*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; - } - - Tensor grad_output_contig_t = grad_output->contiguous(memory_format); - // Make sure that NC11 strides follow formula - grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); - TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; - - Tensor input_contig_t = input->contiguous(memory_format); - input_contig_t.resize_(input_contig_t.sizes(), memory_format); - TensorArg input_contig{ input_contig_t, "input", 2}; - - auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); - - // For uniformity with everything else, although it seems grad_weight - // would be unambiguous too. - TensorArg grad_weight{ grad_weight_t, "result", 0 }; - convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); - - raw_miopen_depthwise_convolution_backward_weight_out( - *grad_weight, *grad_output_contig, *input_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return grad_weight_t; -} - -Tensor miopen_convolution_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_convolution_backward_weight( - "miopen_convolution_backward_weight", - weight_size, grad_output, input, - padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_convolution_transpose_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 
2 }; - return miopen_convolution_backward_weight( - "miopen_convolution_backward_weight", - weight_size, input, grad_output, - padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_depthwise_convolution_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_depthwise_convolution_backward_weight( - "miopen_depthwise_convolution_backward_weight", - weight_size, grad_output, input, - padding, stride, dilation, groups, benchmark, deterministic); -} - -// --------------------------------------------------------------------- -// -// Convolution backward (bias) -// -// --------------------------------------------------------------------- - -Tensor miopen_convolution_backward_bias( - const Tensor& grad_output_t) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }; - - // TODO: Workaround since MIOpen does not support NHWC bias - // See #64426 - std::vector discard_dims; - for( int i = 0; i < grad_output_t.dim(); i++ ) { - if(i != output_channels_dim ) { - discard_dims.push_back(i); - } - } - - Tensor outputBias = at::squeeze( at::sum(grad_output_t, discard_dims, true) ); - if( outputBias.dim() == 0 ) { - // always return a tensor of shape [_] - return outputBias.unsqueeze(0); - } - else { - return outputBias; - } - -/* MIOpen does not support NHWC bias. Activate once support is added. 
- auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options()); - - TensorArg grad_bias{ grad_bias_t, "result", 0 }; - - TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), - static_cast(grad_output->dim())}; - TensorDescriptor odesc{*grad_output}; - - auto handle = getMiopenHandle(); - auto dataType = getMiopenDataType(*grad_bias); - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), - &zero, bdesc.desc(), grad_bias->data_ptr())); - return *grad_bias; -*/ -} - +REGISTER_CUDA_DISPATCH(miopen_convolution_backward_stub, &miopen_convolution_backward); +REGISTER_CUDA_DISPATCH(miopen_convolution_transpose_backward_stub, &miopen_convolution_transpose_backward); +REGISTER_CUDA_DISPATCH(miopen_depthwise_convolution_backward_stub, &miopen_depthwise_convolution_backward); }} // namespace diff --git a/aten/src/ATen/native/mkl/SparseBlasImpl.cpp b/aten/src/ATen/native/mkl/SparseBlasImpl.cpp index 87850473c9ba..79f52bd25609 100644 --- a/aten/src/ATen/native/mkl/SparseBlasImpl.cpp +++ b/aten/src/ATen/native/mkl/SparseBlasImpl.cpp @@ -27,7 +27,7 @@ c10::MaybeOwned prepare_dense_matrix_for_mkl( if (tensor.is_non_overlapping_and_dense() || is_blas_compatible_row_major_order(tensor) || is_blas_compatible_column_major_order(tensor)) { - return c10::MaybeOwned::borrowed(tensor); + return at::native::expect_resolved_conj(tensor); } else { return c10::MaybeOwned::owned( tensor.clone(at::MemoryFormat::Contiguous)); @@ -45,7 +45,7 @@ c10::MaybeOwned prepare_dense_matrix_for_mkl( const Tensor& tensor, bool row_major) { if (is_blas_compatible_row_major_order(tensor) && row_major) { - return c10::MaybeOwned::borrowed(tensor); + return at::native::expect_resolved_conj(tensor); } else { if (row_major) { return c10::MaybeOwned::owned( diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 
7ac1f6a06da1..7c01fe68bded 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3108,56 +3108,14 @@ dispatch: CUDA: miopen_convolution -- func: miopen_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - dispatch: - CUDA: miopen_convolution_backward_input - -- func: miopen_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - dispatch: - CUDA: miopen_convolution_backward - -- func: miopen_convolution_backward_bias(Tensor grad_output) -> Tensor - dispatch: - CUDA: miopen_convolution_backward_bias - -- func: miopen_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - dispatch: - CUDA: miopen_convolution_backward_weight - - func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? 
bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_convolution_transpose -# NB: output_padding not strictly needed here, but it's helpful for the float -# backwards -- func: miopen_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - dispatch: - CUDA: miopen_convolution_transpose_backward - -- func: miopen_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - dispatch: - CUDA: miopen_convolution_transpose_backward_input - -- func: miopen_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - dispatch: - CUDA: miopen_convolution_transpose_backward_weight - - func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? 
bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_depthwise_convolution -- func: miopen_depthwise_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - dispatch: - CUDA: miopen_depthwise_convolution_backward_input - -- func: miopen_depthwise_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - dispatch: - CUDA: miopen_depthwise_convolution_backward - -- func: miopen_depthwise_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - dispatch: - CUDA: miopen_depthwise_convolution_backward_weight - - func: miopen_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? 
dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) dispatch: CUDA: miopen_rnn diff --git a/aten/src/ATen/native/sparse/cuda/SoftMax.cu b/aten/src/ATen/native/sparse/cuda/SoftMax.cu index 42561155a8a8..8ee25426fea9 100644 --- a/aten/src/ATen/native/sparse/cuda/SoftMax.cu +++ b/aten/src/ATen/native/sparse/cuda/SoftMax.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/docs/source/conf.py b/docs/source/conf.py index a1633217fa8e..29387c27c270 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -208,7 +208,7 @@ coverage_missing_automodule = [ "torch.ao.ns.fx", "torch.ao.quantization", "torch.ao.quantization.fx", - "torch.ao.quantization.fx.backend_config_dict", + "torch.ao.quantization.fx.backend_config", "torch.ao.sparsity", "torch.ao.sparsity.experimental", "torch.ao.sparsity.experimental.pruner", diff --git a/docs/source/testing.rst b/docs/source/testing.rst index 9f1e2c3c53f8..213e82b9c4ca 100644 --- a/docs/source/testing.rst +++ b/docs/source/testing.rst @@ -3,8 +3,8 @@ torch.testing .. warning:: - This module is in a PROTOTYPE state. New functions are still being added, and the available functions may change in - future PyTorch releases. We are actively looking for feedback for UI/UX improvements or missing functionalities. + This module is a beta release, and its interfaces and functionality may change without warning in future + PyTorch releases. .. 
automodule:: torch.testing diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 46f1c3bb263f..167723f8f157 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -83,6 +83,16 @@ ALLOW_LIST = [ ("aten::hsplit", datetime.date(2021, 11, 20)), ("aten::dsplit", datetime.date(2021, 11, 20)), ("aten::_convolution_nogroup", datetime.date(9999, 1, 1)), + ("aten::miopen_convolution_backward", datetime.date(9999, 1, 1)), + ("aten::miopen_convolution_backward_bias", datetime.date(9999, 1, 1)), + ("aten::miopen_convolution_backward_input", datetime.date(9999, 1, 1)), + ("aten::miopen_convolution_backward_weight", datetime.date(9999, 1, 1)), + ("aten::miopen_convolution_transpose_backward", datetime.date(9999, 1, 1)), + ("aten::miopen_convolution_transpose_backward_input", datetime.date(9999, 1, 1)), + ("aten::miopen_convolution_transpose_backward_weight", datetime.date(9999, 1, 1)), + ("aten::miopen_depthwise_convolution_backward", datetime.date(9999, 1, 1)), + ("aten::miopen_depthwise_convolution_backward_input", datetime.date(9999, 1, 1)), + ("aten::miopen_depthwise_convolution_backward_weight", datetime.date(9999, 1, 1)), ("caffe2::", datetime.date(2021, 10, 23)), ("prepacked::unpack_prepacked_sizes_conv2d", datetime.date(9999, 1, 1)), ("prepacked::unpack_prepacked_sizes_linear", datetime.date(9999, 1, 1)), diff --git a/test/cpp/tensorexpr/CMakeLists.txt b/test/cpp/tensorexpr/CMakeLists.txt index 397b8b112aec..8fc5a0a18331 100644 --- a/test/cpp/tensorexpr/CMakeLists.txt +++ b/test/cpp/tensorexpr/CMakeLists.txt @@ -17,6 +17,7 @@ set(TENSOREXPR_TEST_SRCS ${TENSOREXPR_TEST_ROOT}/test_memdependency.cpp ${TENSOREXPR_TEST_ROOT}/test_ops.cpp ${TENSOREXPR_TEST_ROOT}/test_quantization.cpp + ${TENSOREXPR_TEST_ROOT}/test_memplanning.cpp ${TENSOREXPR_TEST_ROOT}/test_reductions.cpp 
${TENSOREXPR_TEST_ROOT}/test_registerizer.cpp ${TENSOREXPR_TEST_ROOT}/test_simplify.cpp diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index d0cfbe3a9051..dd8950e8efa1 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -1901,7 +1901,8 @@ TEST(LoopNest, LoopNestComputeAt_1) { std::vector loops = l.getAllLoopNestsWritingToBuf(B.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]); l.prepareForCodegen(); - StmtPtr s = l.root_stmt(); + SimpleIREvaluator cg(l.root_stmt(), {B, N}); + StmtPtr s = cg.stmt(); checkIR(s, R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[1] @@ -1913,7 +1914,6 @@ TEST(LoopNest, LoopNestComputeAt_1) { // Now check that the loop still produces the correct result. std::vector b_data(100, 0); - SimpleIREvaluator cg(s, {B, N}); cg.call({b_data, 100}); std::vector b_ref(100, 0); @@ -1967,7 +1967,8 @@ TEST(LoopNest, LoopNestComputeAt_2) { std::vector loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(p), loops[0]); l.prepareForCodegen(); - StmtPtr s = l.root_stmt(); + SimpleIREvaluator cg(l.root_stmt(), {c, W, H}); + StmtPtr s = cg.stmt(); // Check the IR we produced checkIR(s, R"IR( @@ -1982,7 +1983,6 @@ TEST(LoopNest, LoopNestComputeAt_2) { // Now check that the loop still produces the correct result. std::vector c_data(kW * kH, 0); - SimpleIREvaluator cg(s, {c, W, H}); cg.call({c_data, kW, kH}); assertAllEqual(c_data, c_ref); @@ -1993,7 +1993,8 @@ TEST(LoopNest, LoopNestComputeAt_2) { std::vector loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(p), loops[1]); l.prepareForCodegen(); - StmtPtr s = l.root_stmt(); + SimpleIREvaluator cg(l.root_stmt(), {c, W, H}); + StmtPtr s = cg.stmt(); // Check the IR we produced checkIR(s, R"IR( @@ -2008,7 +2009,6 @@ TEST(LoopNest, LoopNestComputeAt_2) { // Now check that the loop still produces the correct result. 
std::vector c_data(kW * kH, 0); - SimpleIREvaluator cg(s, {c, W, H}); cg.call({c_data, kW, kH}); assertAllEqual(c_data, c_ref); @@ -2063,7 +2063,8 @@ TEST(LoopNest, LoopNestComputeAt_3) { std::vector loops = l.getAllLoopNestsWritingToBuf(D.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]); l.prepareForCodegen(); - StmtPtr s = l.root_stmt(); + SimpleIREvaluator cg(l.root_stmt(), {D, W, H}); + StmtPtr s = cg.stmt(); // Check the IR we produced checkIR(s, R"IR( @@ -2083,7 +2084,6 @@ TEST(LoopNest, LoopNestComputeAt_3) { // Now check that the loop still produces the correct result. std::vector c_data(kW * kH, 0); - SimpleIREvaluator cg(s, {D, W, H}); cg.call({c_data, kW, kH}); assertAllEqual(c_data, c_ref); @@ -2094,7 +2094,8 @@ TEST(LoopNest, LoopNestComputeAt_3) { std::vector loops = l.getAllLoopNestsWritingToBuf(D.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[1]); l.prepareForCodegen(); - StmtPtr s = l.root_stmt(); + SimpleIREvaluator cg(l.root_stmt(), {D, W, H}); + StmtPtr s = cg.stmt(); // Check the IR we produced checkIR(s, R"IR( @@ -2114,7 +2115,6 @@ TEST(LoopNest, LoopNestComputeAt_3) { // Now check that the loop still produces the correct result. std::vector c_data(kW * kH, 0); - SimpleIREvaluator cg(s, {D, W, H}); cg.call({c_data, kW, kH}); assertAllEqual(c_data, c_ref); @@ -2174,7 +2174,8 @@ TEST(LoopNest, Reduce2dComputeAt) { // l.simplify(); l.eliminateDeadStores(); l.prepareForCodegen(); - checkIR(l.root_stmt(), R"IR( + SimpleIREvaluator cg(l.root_stmt(), {c, W, H}); + checkIR(cg.stmt(), R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[2, W + 1] # CHECK: for (int cy = 0; cy < H; cy++) { # CHECK: for (int idx0 = 0; idx0 < 2; idx0++) { @@ -2193,11 +2194,9 @@ TEST(LoopNest, Reduce2dComputeAt) { # CHECK: } # CHECK: Free(temp); )IR"); - StmtPtr s = l.root_stmt(); // Now check that the loop still produces the correct result. 
std::vector c_data(kW * kH, 0); - SimpleIREvaluator cg(s, {c, W, H}); cg.call({c_data, kW, kH}); assertAllEqual(c_data, c_ref); } @@ -2209,7 +2208,8 @@ TEST(LoopNest, Reduce2dComputeAt) { l.simplify(); l.eliminateDeadStores(); l.prepareForCodegen(); - checkIR(l.root_stmt(), R"IR( + SimpleIREvaluator cg(l.root_stmt(), {c, W, H}); + checkIR(cg.stmt(), R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[2, 2] # CHECK: for (int cy = 0; cy < H; cy++) { # CHECK: for (int cx = 0; cx < W; cx++) { @@ -2228,11 +2228,9 @@ TEST(LoopNest, Reduce2dComputeAt) { # CHECK: } # CHECK: Free(temp); )IR"); - StmtPtr s = l.root_stmt(); // Now check that the loop still produces the correct result. std::vector c_data(kW * kH, 0); - SimpleIREvaluator cg(s, {c, W, H}); cg.call({c_data, kW, kH}); assertAllEqual(c_data, c_ref); } @@ -3737,11 +3735,13 @@ TEST(LoopNest, CacheReadsSimple) { l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + SimpleIREvaluator cg(result, {B, C}); + result = cg.stmt(); // just this once: verify the whole thing. 
checkIR(result, R"IR( -#CHECK: Allocate(A_local); // dtype=int, dims=[1, 10] #CHECK: Allocate(A); // dtype=int, dims=[64, 64] +#CHECK: Allocate(A_local); // dtype=int, dims=[1, 10] #CHECK: for (int i #CHECK: for (int j #CHECK: A[ @@ -3760,13 +3760,12 @@ TEST(LoopNest, CacheReadsSimple) { #CHECK: C[ #CHECK: } #CHECK: } -#CHECK: Free(A); #CHECK: Free(A_local); +#CHECK: Free(A); )IR"); std::vector b_data(200, 0); std::vector c_data(200, 0); - SimpleIREvaluator cg(l.root_stmt(), {B, C}); cg.call({b_data, c_data}); std::vector b_ref(200, 0); @@ -3803,6 +3802,8 @@ TEST(LoopNest, CacheReadsOuter) { l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + SimpleIREvaluator cg(result, {B, C}); + result = cg.stmt(); checkIR(result, R"IR( #CHECK: Allocate(A_local); // dtype=int, dims=[21, 11] @@ -3812,7 +3813,6 @@ TEST(LoopNest, CacheReadsOuter) { std::vector b_data(200, 0); std::vector c_data(200, 0); - SimpleIREvaluator cg(l.root_stmt(), {B, C}); cg.call({b_data, c_data}); std::vector b_ref(200, 0); @@ -3848,6 +3848,8 @@ TEST(LoopNest, CacheReadsInternal) { LoopNest::cacheAccesses(A.buf(), "A_local", j_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + SimpleIREvaluator cg(result, {B, C}); + result = cg.stmt(); checkIR(result, R"IR( #CHECK: Allocate(A_local); // dtype=int, dims=[2, 11] @@ -3857,7 +3859,6 @@ TEST(LoopNest, CacheReadsInternal) { std::vector b_data(200, 0); std::vector c_data(200, 0); - SimpleIREvaluator cg(l.root_stmt(), {B, C}); cg.call({b_data, c_data}); std::vector b_ref(200, 0); @@ -3894,6 +3895,8 @@ TEST(LoopNest, CacheReadsInner) { LoopNest::cacheAccesses(A.buf(), "A_local", body); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + SimpleIREvaluator cg(result, {B, C}); + result = cg.stmt(); checkIR(result, R"IR( #CHECK: Allocate(A_local); // dtype=int, dims=[5, 2] @@ -3903,7 +3906,6 @@ TEST(LoopNest, CacheReadsInner) { std::vector b_data(200, 0); std::vector 
c_data(200, 0); - SimpleIREvaluator cg(l.root_stmt(), {B, C}); cg.call({b_data, c_data}); std::vector b_ref(200, 0); @@ -3940,6 +3942,8 @@ TEST(LoopNest, CacheWritesSimple) { l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + SimpleIREvaluator cg(result, {B, C}); + result = cg.stmt(); checkIR(result, R"IR( #CHECK: Allocate(A_local); // dtype=int, dims=[1, 64] @@ -3953,7 +3957,6 @@ TEST(LoopNest, CacheWritesSimple) { std::vector b_data(200, 0); std::vector c_data(200, 0); - SimpleIREvaluator cg(l.root_stmt(), {B, C}); cg.call({b_data, c_data}); std::vector b_ref(200, 0); diff --git a/test/cpp/tensorexpr/test_memdependency.cpp b/test/cpp/tensorexpr/test_memdependency.cpp index d3ac6f4a5bd0..7019353937b7 100644 --- a/test/cpp/tensorexpr/test_memdependency.cpp +++ b/test/cpp/tensorexpr/test_memdependency.cpp @@ -3021,10 +3021,11 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) { } loop.prepareForCodegen(); + SimpleIREvaluator cg(loop.root_stmt(), {AP, BP, CT}); // now check lowered dependency graph. 
{ - StmtPtr stmt = IRSimplifier::simplify(loop.root_stmt()); + StmtPtr stmt = IRSimplifier::simplify(cg.stmt()); stmt->accept(&analyzer_lowered); // Lowering will change the dimensionality of all bounds due to index diff --git a/test/cpp/tensorexpr/test_memplanning.cpp b/test/cpp/tensorexpr/test_memplanning.cpp new file mode 100644 index 000000000000..ec58aa8f6668 --- /dev/null +++ b/test/cpp/tensorexpr/test_memplanning.cpp @@ -0,0 +1,510 @@ +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { + +using namespace torch::jit::tensorexpr; + +extern void checkIR(StmtPtr s, const std::string& pattern); + +TEST(BufLiveRange, SingleRangeLine) { + VarHandle i("i", kInt), j("j", kInt); + BufHandle a("a", {32}, kFloat); + BufHandle b("b", {32, 32}, kFloat); + + // Construct Stmt: + // { + // for (int i = 0; i < 32; i++) { + // a[i] = 0; + // for (int j = 0; j < 32; j++) { + // a[i] = (a[i]) + (b[i, j]); + // } + // } + // } + + StorePtr aInit = Store::make(a, {i}, 0); + ExprHandle reduce = a.load({i}) + b.load({i, j}); + StorePtr aReduce = Store::make(a, {i}, reduce); + StmtPtr loop = + For::make(i, 0, 32, Block::make({aInit, For::make(j, 0, 32, aReduce)})); + + StmtPtr stmt = Block::make({loop}); + + auto range = BufLiveRange::liveRange(stmt, a.node()); + ASSERT_TRUE(std::get<0>(range) == 0); + ASSERT_TRUE(std::get<1>(range) == 0); +} + +TEST(BufLiveRange, MulRangeLine) { + VarHandle i("i", kInt); + BufHandle a("a", {32}, kFloat); + BufHandle b("b", {32}, kFloat); + + // Construct Stmt: + // { + // for (int i = 0; i < 32; i++) { + // if (i<10 ? 1 : 0) { + // a[i] = i + i; + // b[i] = i * i; + // } + // } + // for (int i = 0; i < 32; i++) { + // if (i>10 ? 
1 : 0) { + // a[i] = i * i; + // b[i] = i + i; + // } + // } + // } + + StorePtr aStore_1 = Store::make(a, {i}, i + i); + StorePtr bStore_1 = Store::make(b, {i}, i * i); + StmtPtr loop_1 = For::make( + i, 0, 32, Cond::make(i < 10, Block::make({aStore_1, bStore_1}), NULL)); + + StorePtr aStore_2 = Store::make(a, {i}, i * i); + StorePtr bStore_2 = Store::make(b, {i}, i + i); + StmtPtr loop_2 = For::make( + i, 0, 32, Cond::make(i > 10, Block::make({aStore_2, bStore_2}), NULL)); + + StmtPtr stmt = Block::make({loop_1, loop_2}); + + auto range_a = BufLiveRange::liveRange(stmt, a.node()); + ASSERT_TRUE(std::get<0>(range_a) == 0); + ASSERT_TRUE(std::get<1>(range_a) == 1); + + auto range_b = BufLiveRange::liveRange(stmt, b.node()); + ASSERT_TRUE(std::get<0>(range_b) == 0); + ASSERT_TRUE(std::get<1>(range_b) == 1); +} + +TEST(MemPlanning, SameBufSizeMemReuse) { + int M = 1024; + int N = 1024; + int K = 2048; + + BufHandle AP("A", {M, K}, kFloat); + BufHandle BP("B", {K, N}, kFloat); + + Tensor CT = Reduce( + "gemm", + {{M, "M"}, {N, "N"}}, + Sum(), + [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { + return AP.load(m, k) * BP.load(k, n); + }, + {{K, "K"}}); + Tensor DT = Compute( + "relu", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + auto zero = Cast::make(CT.buf()->dtype(), 0); + return CompareSelect::make( + CT.load(m, n), zero, zero, CT.load(m, n), kLT); + }); + Tensor ET = Compute( + "add", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return DT.load(m, n) + DT.load(m, n); + }); + Tensor FT = Compute( + "mul", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return ET.load(m, n) * ET.load(m, n); + }); + auto stmt = Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt()}); + + // Constructed stmt: + // Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2], + // add [2, 3] Buffer 'gemm' and 'add' are the same size; we'll reuse 'gemm' + // for 
'add'. + //{ + // for (int M = 0; M < 1024; M++) { + // for (int N = 0; N < 1024; N++) { + // gemm[M, N] = float(0); + // for (int K = 0; K < 2048; K++) { + // gemm[M, N] = ReduceOp((gemm[M, N]) + (A[M, K]) * (B[K, N]), + // reduce_args={K}); + // } + // } + // } + // for (int M_1 = 0; M_1 < 1024; M_1++) { + // for (int N_1 = 0; N_1 < 1024; N_1++) { + // relu[M_1, N_1] = (gemm[M_1, N_1])dtype(), 0); + return CompareSelect::make( + CT.load(m, n), zero, zero, CT.load(m, n), kLT); + }); + Tensor ET = Compute( + "add", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return DT.load(m, n) + DT.load(m, n); + }); + Tensor FT = Compute( + "mul", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return ET.load(m, n) * ET.load(m, n); + }); + Tensor GT = Compute( + "sub", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return FT.load(m, n) - ET.load(m, n); + }); + + auto stmt = + Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt(), GT.stmt()}); + + // Constructed stmt: + // Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2], + // add [2, 3], mul [3, 4] Buffer 'gemm', 'relu, ''add' and 'mul' are the same + // size; we'll reuse 'gemm' for 'add', and reuse 'relu' for 'mul' + //{ + // for (int M = 0; M < 1024; M++) { + // for (int N = 0; N < 1024; N++) { + // gemm[M, N] = float(0); + // for (int K = 0; K < 2048; K++) { + // gemm[M, N] = ReduceOp((gemm[M, N]) + (A[M, K]) * (B[K, N]), + // reduce_args={K}); + // } + // } + // } + // for (int M_1 = 0; M_1 < 1024; M_1++) { + // for (int N_1 = 0; N_1 < 1024; N_1++) { + // relu[M_1, N_1] = (gemm[M_1, N_1])dtype(), 0); + return CompareSelect::make( + CT.load(m, n), zero, zero, CT.load(m, n), kLT); + }); + Tensor ET = Compute( + "add", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return DT.load(m, n) + DT.load(m, n); + }); + Tensor FT = Compute( + "mul", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& 
m, const ExprHandle& n) { + return ET.load(m, n) * ET.load(m, n); + }); + Tensor GT = Compute( + "sub", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return FT.load(m, n) - 1; + }); + Tensor HT = Compute( + "div", + {{M, "M"}, {N, "N"}}, + [&](const ExprHandle& m, const ExprHandle& n) { + return GT.load(m, n) / 2; + }); + + auto stmt = Block::make( + {CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt(), GT.stmt(), HT.stmt()}); + + // Constructed stmt: + // Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2], + // add [2, 3], mul [3, 4], sub [4, 5] Buffer 'gemm', 'relu, ''add', 'mul' and + // 'sub' are the same size; we'll reuse 'gemm' for 'add', reuse 'relu' for + // 'mul', and reuse 'gemm' for 'sub'. + //{ + // for (int M = 0; M < 1024; M++) { + // for (int N = 0; N < 1024; N++) { + // gemm[M, N] = float(0); + // for (int K = 0; K < 2048; K++) { + // gemm[M, N] = ReduceOp((gemm[M, N]) + (A[M, K]) * (B[K, N]), + // reduce_args={K}); + // } + // } + // } + // for (int M_1 = 0; M_1 < 1024; M_1++) { + // for (int N_1 = 0; N_1 < 1024; N_1++) { + // relu[M_1, N_1] = (gemm[M_1, N_1])dtype(), 0); + return CompareSelect::make( + CT.load(m, n), zero, zero, CT.load(m, n), kLT); + }); + Tensor ET = Compute( + "add", + {{M * 2, "EM"}, {N * 2, "EN"}}, + [&](const ExprHandle& em, const ExprHandle& en) { + return DT.load(em / 2, en / 2) + DT.load(em / 2, en / 2); + }); + Tensor FT = Compute( + "mul", + {{M * 2, "FM"}, {N * 2, "FN"}}, + [&](const ExprHandle& fm, const ExprHandle& fn) { + return ET.load(fm, fn) * ET.load(fm, fn); + }); + auto stmt = Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt()}); + + // Constructed stmt: + // Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2], + // add [2, 3] We do not reuse buffer 'gemm' for 'add' because the size of + // buffer 'gemm' is smaller. 
+ //{ + // for (int M = 0; M < 1024; M++) { + // for (int N = 0; N < 1024; N++) { + // gemm[M, N] = float(0); + // for (int K = 0; K < 2048; K++) { + // gemm[M, N] = ReduceOp((gemm[M, N]) + (A[M, K]) * (B[K, N]), + // reduce_args={K}); + // } + // } + // } + // for (int M_1 = 0; M_1 < 1024; M_1++) { + // for (int N_1 = 0; N_1 < 1024; N_1++) { + // relu[M_1, N_1] = (gemm[M_1, N_1])= 5 +def _fake_filter_fn_constant(constant, data): + return data >= constant + + def _worker_init_fn(worker_id): random.seed(123) @@ -669,10 +677,12 @@ class TestFunctionalIterDataPipe(TestCase): arr = range(10) picklable_datapipes: List[Tuple[Type[IterDataPipe], IterDataPipe, Tuple, Dict[str, Any]]] = [ (dp.iter.Mapper, dp.iter.IterableWrapper(arr), (), {}), - (dp.iter.Mapper, dp.iter.IterableWrapper(arr), (_fake_fn, (0, ), {'test': True}), {}), + (dp.iter.Mapper, dp.iter.IterableWrapper(arr), (_fake_fn, (0, )), {}), + (dp.iter.Mapper, dp.iter.IterableWrapper(arr), (partial(_fake_add, 1), (0,)), {}), (dp.iter.Collator, dp.iter.IterableWrapper(arr), (), {}), - (dp.iter.Collator, dp.iter.IterableWrapper(arr), (_fake_fn, (0, ), {'test': True}), {}), - (dp.iter.Filter, dp.iter.IterableWrapper(arr), (_fake_filter_fn, (0, ), {'test': True}), {}), + (dp.iter.Collator, dp.iter.IterableWrapper(arr), (_fake_fn, (0, )), {}), + (dp.iter.Filter, dp.iter.IterableWrapper(arr), (_fake_filter_fn, (0, )), {}), + (dp.iter.Filter, dp.iter.IterableWrapper(arr), (partial(_fake_filter_fn, 5), (0,)), {}), ] for dpipe, input_dp, dp_args, dp_kwargs in picklable_datapipes: p = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] @@ -1035,9 +1045,6 @@ class TestFunctionalIterDataPipe(TestCase): pass traverse(dp2) # This should not raise any error either - - - @suppress_warnings # Suppress warning for lambda fn def test_map_datapipe(self): input_dp = dp.iter.IterableWrapper(range(10)) @@ -1050,12 +1057,6 @@ class TestFunctionalIterDataPipe(TestCase): for x, y in zip(map_dp, input_dp): 
self.assertEqual(x, torch.tensor(y, dtype=torch.float)) - map_dp = input_dp.map(fn=fn, fn_args=(torch.int, ), fn_kwargs={'sum': True}) - self.assertEqual(len(input_dp), len(map_dp)) - for x, y in zip(map_dp, input_dp): - self.assertEqual(x, torch.tensor(y, dtype=torch.int).sum()) - - from functools import partial map_dp = input_dp.map(partial(fn, dtype=torch.int, sum=True)) self.assertEqual(len(input_dp), len(map_dp)) for x, y in zip(map_dp, input_dp): @@ -1333,7 +1334,6 @@ class TestFunctionalIterDataPipe(TestCase): _helper(batch_size=3, drop_last=True, batch_num=2, sort_key=_sort_fn) _helper(batch_size=3, drop_last=True, batch_num=2, bucket_num=2, sort_key=_sort_fn) - def test_filter_datapipe(self): input_ds = dp.iter.IterableWrapper(range(10)) @@ -1342,11 +1342,11 @@ class TestFunctionalIterDataPipe(TestCase): return data >= val return True - filter_dp = input_ds.filter(filter_fn=_filter_fn, fn_args=(5, )) + filter_dp = input_ds.filter(partial(_filter_fn, val=5)) for data, exp in zip(filter_dp, range(10)): self.assertEqual(data, exp) - filter_dp = input_ds.filter(filter_fn=_filter_fn, fn_kwargs={'val': 5, 'clip': True}) + filter_dp = input_ds.filter(partial(_filter_fn, val=5, clip=True)) for data, exp in zip(filter_dp, range(5, 10)): self.assertEqual(data, exp) @@ -1427,7 +1427,8 @@ class TestFunctionalMapDataPipe(TestCase): Tuple[Type[MapDataPipe], MapDataPipe, Tuple, Dict[str, Any]] ] = [ (dp.map.Mapper, dp.map.SequenceWrapper(arr), (), {}), - (dp.map.Mapper, dp.map.SequenceWrapper(arr), (_fake_fn, (0,), {'test': True}), {}), + (dp.map.Mapper, dp.map.SequenceWrapper(arr), (_fake_fn, (0,)), {}), + (dp.map.Mapper, dp.map.SequenceWrapper(arr), (partial(_fake_add, 1), (0,)), {}), ] for dpipe, input_dp, dp_args, dp_kwargs in picklable_datapipes: p = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] @@ -1540,7 +1541,6 @@ class TestFunctionalMapDataPipe(TestCase): shuffler_dp = input_dp1.shuffle() self.assertEqual(10, len(shuffler_dp)) - 
def test_map_datapipe(self): arr = range(10) input_dp = dp.map.SequenceWrapper(arr) @@ -1556,15 +1556,6 @@ class TestFunctionalMapDataPipe(TestCase): map_dp[index], torch.tensor(input_dp[index], dtype=torch.float) ) - map_dp = input_dp.map(fn=fn, fn_args=(torch.int,), fn_kwargs={'sum': True}) - self.assertEqual(len(input_dp), len(map_dp)) - for index in arr: - self.assertEqual( - map_dp[index], torch.tensor(input_dp[index], dtype=torch.int).sum() - ) - - from functools import partial - map_dp = input_dp.map(partial(fn, dtype=torch.int, sum=True)) self.assertEqual(len(input_dp), len(map_dp)) for index in arr: diff --git a/test/test_nn.py b/test/test_nn.py index bbb686a843a2..94d0854f6499 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -1834,11 +1834,128 @@ class TestNN(NNTestCase): parameters.pop('p4') check() + # Check reverse works + forward = list(iter(parameter_dict)) + backward = list(reversed(parameter_dict)) + self.assertEqual(len(forward), len(backward)) + n = len(forward) + for i in range(n): + self.assertIs(forward[i], backward[n - i - 1]) + check() + + # Check copy works + copy = parameter_dict.copy() + + # Check all keys are present and have shallow copied values + for key in parameter_dict: + self.assertTrue(key in copy) + self.assertEqual(parameter_dict[key], copy[key]) + self.assertIs(parameter_dict[key], copy[key]) + check() + + parameter_dict["p20"] = Parameter(torch.randn(10, 10)) + copy["p21"] = Parameter(torch.randn(9, 10)) + + self.assertTrue("p20" in parameter_dict) + self.assertFalse("p20" in copy) + self.assertFalse("p21" in parameter_dict) + self.assertTrue("p21" in copy) + parameter_dict.pop("p20") + check() + + p = Parameter(torch.randn(10, 10)) + parameter_dict['p12'] = p + p_popitem = parameter_dict.popitem() + self.assertEqual(p_popitem[0], 'p12') + self.assertIs(p_popitem[1], p) + + # Unit test for set_default + # 1. 
Ensure parameter is correctly inserted when + # the key is not present in `ParameterDict` + assert 'p11' not in parameter_dict + parameters['p11'] = Parameter(torch.randn(10, 10)) + p_setdefault = parameter_dict.setdefault('p11', parameters['p11']) + self.assertIs(p_setdefault, parameters['p11']) + # 2. Ensure parameter is NOT inserted when the + # key is already present in `ParameterDict` + p = Parameter(torch.randn(10, 10)) + self.assertFalse(parameter_dict.setdefault('p11', p) is p) + # 3. Ensure `None` is inserted when the key is not + # present in `Parameter` and parameter is not specified + self.assertIs(parameter_dict.setdefault('p26'), None) + del parameter_dict['p26'] + check() + + parameters2 = OrderedDict([ + ('p13', Parameter(torch.randn(10, 10))), + ('p2', Parameter(torch.randn(10, 10))), + ('p3', Parameter(torch.randn(10, 10))), + ]) + parameter_dict2 = nn.ParameterDict(parameters2) + parameters.update(parameters2) + parameter_dict |= parameter_dict2 + check() + + parameters2 = OrderedDict() + parameter_dict2 = nn.ParameterDict(parameters2) + parameters.update(parameters2) + parameter_dict |= parameter_dict2 + check() + + parameters2 = OrderedDict([ + ('p14', Parameter(torch.randn(10, 10))), + ('p15', Parameter(torch.randn(10, 10))), + ('p13', Parameter(torch.randn(10, 10))), + ]) + parameter_dict2 = nn.ParameterDict(parameters2) + parameters.update(parameters2) + parameter_dict |= parameter_dict2 + check() + + # Check __or__ and __ror__ works + parameters2 = OrderedDict([ + ('p20', Parameter(torch.randn(10, 10))), + ('p21', Parameter(torch.randn(10, 10))), + ('p22', Parameter(torch.randn(10, 10))), + ]) + parameter_dict2 = nn.ParameterDict(parameters2) + parameters.update(parameters2) + parameter_dict = parameter_dict | parameter_dict2 + check() + + parameters2 = OrderedDict([ + ('p23', Parameter(torch.randn(10, 10))), + ('p24', Parameter(torch.randn(10, 10))), + ('p25', Parameter(torch.randn(10, 10))), + ]) + parameter_dict2 = 
nn.ParameterDict(parameters2) + parameters2.update(parameters) + parameters = parameters2 + parameter_dict = parameter_dict2 | parameter_dict + check() + + parameters['p17'] = Parameter(torch.randn(10, 10)) + parameter_dict['p17'] = parameters['p17'] + self.assertIs(parameters['p17'], parameter_dict.get('p17')) + temp_param = Parameter(torch.randn(10, 10)) + self.assertIs(parameters['p17'], parameter_dict.get('p17', temp_param)) + self.assertIs(None, parameter_dict.get('p18')) + self.assertIs(temp_param, parameter_dict.get('p18', temp_param)) + check() + parameter_dict.clear() self.assertEqual(len(parameter_dict), 0) parameters.clear() check() + parameter_dict2 = parameter_dict.fromkeys(['p19', 'p20']) + self.assertEqual({'p19': None, 'p20': None}, parameter_dict2) + check() + + parameter_dict2 = parameter_dict.fromkeys(['p19', 'p20'], temp_param) + self.assertEqual({'p19': temp_param, 'p20': temp_param}, parameter_dict2) + check() + def test_add_module(self): methods_to_test = ['add_module', 'register_module'] for fn in methods_to_test: diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py index b2ef98aba037..c78d713f003e 100644 --- a/test/test_sparse_csr.py +++ b/test/test_sparse_csr.py @@ -603,11 +603,11 @@ class TestSparseCSR(TestCase): ), shape=a.shape, ) - expected = alpha * (a_bsr * b.cpu().numpy()) + beta * c.cpu().numpy() + expected = alpha * (a_bsr * b.cpu().resolve_conj().numpy()) + beta * c.cpu().numpy() self.assertEqual(actual, out) self.assertEqual(actual, expected) - @onlyCUDA + @skipCPUIfNoMklSparse @unittest.skipIf(not TEST_SCIPY, "SciPy not found") @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_block_addmm(self, device, dtype): @@ -623,7 +623,7 @@ class TestSparseCSR(TestCase): for op_b, op_out in itertools.product([True, False], repeat=2): self.run_test_block_addmm_addmv(torch.addmm, c, a, b, op_b, op_out, dtype=dtype, device=device) - @onlyCUDA + @skipCPUIfNoMklSparse @unittest.skipIf(not TEST_SCIPY, 
"SciPy not found") @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_block_addmv(self, device, dtype): diff --git a/third_party/cuda.BUILD b/third_party/cuda.BUILD index 0c58b34a52e7..a948415f9138 100644 --- a/third_party/cuda.BUILD +++ b/third_party/cuda.BUILD @@ -1,43 +1,76 @@ -""" -Collect all the CUDA stuff from @local_config_cuda in a single target -for convenience. -""" +# Adopted from: https://github.com/tensorflow/runtime/blob/master/third_party/rules_cuda/private/BUILD.local_cuda +# Library targets are created corresponding to BUILD.bazel's needs. + +cc_library( + name = "cuda_headers", + hdrs = glob([ + "include/**", + "targets/x86_64-linux/include/**", + ]), + includes = [ + "include", + "targets/x86_64-linux/include", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "cuda_driver", + srcs = ["lib64/stubs/libcuda.so"], + visibility = ["//visibility:public"], +) cc_library( name = "cuda", + srcs = ["targets/x86_64-linux/lib/libcudart.so"], visibility = ["//visibility:public"], - deps = [ - "@local_config_cuda//cuda:cublas", - "@local_config_cuda//cuda:cuda_driver", - "@local_config_cuda//cuda:cuda_headers", - "@local_config_cuda//cuda:cudart", - "@local_config_cuda//cuda:cufft", - "@local_config_cuda//cuda:curand", - ], + deps = [":cuda_headers"], ) cc_library( - name = "cupti", - deps = [ - "@local_config_cuda//cuda:cupti_headers", - "@local_config_cuda//cuda:cupti_link", - ], + name = "cufft", + srcs = ["targets/x86_64-linux/lib/libcufft.so"], + visibility = ["//visibility:public"], ) -[ - alias( - name = lib, - actual = "@local_config_cuda//cuda:{}".format(lib), - visibility = ["//visibility:public"], - ) - for lib in [ - "cublas", - "cufft", - "cusolver", - "cusparse", - "curand", - "nvrtc", - "cuda_driver", - "nvToolsExt", - ] -] +cc_library( + name = "cublas", + srcs = [ + "targets/x86_64-linux/lib/libcublasLt.so", + "targets/x86_64-linux/lib/libcublas.so", + ], + visibility = 
["//visibility:public"], +) + +cc_library( + name = "curand", + srcs = ["targets/x86_64-linux/lib/libcurand.so"], + visibility = ["//visibility:public"], +) + +cc_library( + name = "cusolver", + srcs = ["targets/x86_64-linux/lib/libcusolver.so"], + visibility = ["//visibility:public"], +) + +cc_library( + name = "cusparse", + srcs = ["targets/x86_64-linux/lib/libcusparse.so"], + visibility = ["//visibility:public"], +) + +cc_library( + name = "nvrtc", + srcs = [ + "targets/x86_64-linux/lib/libnvrtc.so", + "targets/x86_64-linux/lib/libnvrtc-builtins.so", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "nvToolsExt", + srcs = [ "lib64/libnvToolsExt.so"], + visibility = ["//visibility:public"], +) diff --git a/third_party/cudnn.BUILD b/third_party/cudnn.BUILD new file mode 100644 index 000000000000..03736508a68c --- /dev/null +++ b/third_party/cudnn.BUILD @@ -0,0 +1,26 @@ +# Adopted from: https://github.com/NVIDIA/TRTorch/blob/master/third_party/cudnn/local/BUILD + +cc_library( + name = "cudnn_headers", + hdrs = ["include/cudnn.h"] + glob([ + "include/cudnn+.h", + "include/cudnn_*.h", + ]), + includes = ["include/"], + visibility = ["//visibility:private"], +) + +cc_import( + name = "cudnn_lib", + shared_library = "lib/x86_64-linux-gnu/libcudnn.so", + visibility = ["//visibility:private"], +) + +cc_library( + name = "cudnn", + visibility = ["//visibility:public"], + deps = [ + "cudnn_headers", + "cudnn_lib", + ], +) diff --git a/third_party/gloo.BUILD b/third_party/gloo.BUILD index 5db68095fee3..3f623e54e6ad 100644 --- a/third_party/gloo.BUILD +++ b/third_party/gloo.BUILD @@ -48,8 +48,8 @@ cc_library( cu_library( name = "gloo_cuda", srcs = [ - "gloo/cuda.cu.cc", - "gloo/cuda_private.cu.cc", + "gloo/cuda.cu", + "gloo/cuda_private.cu", ], visibility = ["//visibility:public"], deps = [ @@ -72,8 +72,8 @@ cc_library( "gloo/cuda*.cc", "gloo/common/win.cc", "gloo/rendezvous/redis_store.cc", - ], - ), + ] + ) + if_cuda(glob(["gloo/cuda*.cc"])), copts = 
[ "-std=gnu++11", "-std=c++11", diff --git a/third_party/substitution.bzl b/third_party/substitution.bzl index db376ebfe52b..7b14b3c8a1c3 100644 --- a/third_party/substitution.bzl +++ b/third_party/substitution.bzl @@ -58,7 +58,7 @@ def header_template_rule_impl(ctx): CcInfo(compilation_context = cc_common.create_compilation_context( # pass out the include path for finding this header - includes = depset([ctx.outputs.out.dirname, ctx.bin_dir.path]), + system_includes = depset([ctx.attr.include, ctx.outputs.out.dirname, ctx.bin_dir.path]), # and the actual header here. headers = depset([ctx.outputs.out]), @@ -68,6 +68,7 @@ def header_template_rule_impl(ctx): header_template_rule = rule( attrs = { "out": attr.output(mandatory = True), + "include": attr.string(), "src": attr.label( mandatory = True, allow_single_file = True, diff --git a/third_party/tensorflow_cuda_bazel_build/BUILD b/third_party/tensorflow_cuda_bazel_build/BUILD deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/third_party/tensorflow_cuda_bazel_build/README.md b/third_party/tensorflow_cuda_bazel_build/README.md deleted file mode 100644 index 439e195d8e44..000000000000 --- a/third_party/tensorflow_cuda_bazel_build/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Config for CUDA - -This is a checked-in copy of the auto-generated config for building CUDA code with bazel. The content of this folder was generated from https://github.com/tensorflow/tensorflow `./configure` execution and then edited manually to fit the pytorch needs. - -The LICENSE for the TensorFlow project is APACHE 2. The full LICENSE file could be found here https://github.com/tensorflow/tensorflow/blob/master/LICENSE. 
diff --git a/third_party/tensorflow_cuda_bazel_build/WORKSPACE b/third_party/tensorflow_cuda_bazel_build/WORKSPACE deleted file mode 100644 index 59369ce679c1..000000000000 --- a/third_party/tensorflow_cuda_bazel_build/WORKSPACE +++ /dev/null @@ -1 +0,0 @@ -workspace(name = "local_config_cuda") diff --git a/third_party/tensorflow_cuda_bazel_build/cuda/BUILD b/third_party/tensorflow_cuda_bazel_build/cuda/BUILD deleted file mode 100755 index f7271af2750b..000000000000 --- a/third_party/tensorflow_cuda_bazel_build/cuda/BUILD +++ /dev/null @@ -1,451 +0,0 @@ -licenses([ - "restricted", - "reciprocal", - "notice", -]) # MPL2, portions GPL v3, LGPL v3, BSD-like - -package(default_visibility = ["//visibility:public"]) - -config_setting( - name = "using_nvcc", - values = { - "define": "using_cuda_nvcc=true", - }, -) - -config_setting( - name = "using_clang", - values = { - "define": "using_cuda_clang=true", - }, -) - -# Equivalent to using_clang && -c opt. -config_setting( - name = "using_clang_opt", - values = { - "define": "using_cuda_clang=true", - "compilation_mode": "opt", - }, -) - -config_setting( - name = "darwin", - values = {"cpu": "darwin"}, -) - -cc_library( - name = "cuda_headers", - hdrs = [ - ":cuda-include", - ":cudnn-include", - ], - includes = [ - ".", - "include", - ], -) - -cc_library( - name = "cudnn_headers", - hdrs = [ - ":cudnn-include", - ], - includes = [ - ".", - "include", - ], -) - -cc_library( - name = "cudart_static", - linkopts = [ - "-L/usr/local/cuda/lib64", - ], -) - -cc_library( - name = "cuda_driver", - linkopts = ["-lcuda"], - deps = [":linker_search_path"], -) - -# Provides the RPATH for Nvidia-less sytems to be able to run binaries linked to libcuda. 
-cc_library( - name = "driver_stub_runtime", - linkopts = [ - "-Wl,-rpath,/usr/local/cuda/lib64/stubs", - ], - deps = [":cuda_driver"], -) - -cc_library( - name = "linker_search_path", - linkopts = [ - "-L/usr/local/cuda/lib64", - "-L/usr/local/cuda/lib64/stubs", - "-Wl,-rpath-link,/usr/local/cuda/lib64", - "-Wl,-rpath-link,/usr/local/cuda/lib64/stubs", - ], -) - -[ - cc_library( - name = libname, - linkopts = ["-l" + libname] + (["-lgomp"] if (libname == "cusolver") else []), - linkstatic = True, - deps = [":linker_search_path"], - ) - for libname in [ - "cublas", - "cudart", - "cudnn", - "cufft", - "curand", - "cusolver", - "cusparse", - "nvrtc", - "nvToolsExt", - ] -] - -cc_library( - name = "cuda", - deps = [ - ":cublas", - ":cuda_headers", - ":cudart", - ":cudnn", - ":cufft", - ":curand", - ":nvToolsExt", - ], -) - -# NVIDIA Performance Primitives (http://docs.nvidia.com/cuda/npp/modules.html)) -# used by OpenCV -cc_library( - name = "nppi", - linkopts = [ - "-lnppc", - "-lnppial", - "-lnppicom", - "-lnppidei", - "-lnppif", - "-lnppig", - "-lnppim", - "-lnppist", - "-lnppitc", - "-lnpps", - ], - linkstatic = True, - deps = [":linker_search_path"], -) - -# NVIDIA Management Library -cc_library( - name = "nvml", - linkopts = [ - "-lnvidia-ml", - "-Wl,-rpath,/usr/lib/nvidia-410", - "-Wl,-rpath,/usr/lib/nvidia-390", - "-Wl,-rpath,/usr/lib/nvidia-387", - "-Wl,-rpath,/usr/lib/nvidia-384", - ], - deps = [":linker_search_path"], -) - -cc_library( - name = "cupti_headers", - hdrs = [ - ":cuda-extras", - ], - includes = [ - ".", - "extras/CUPTI/include/", - ], -) - -# cupti .so exposed at linktime -cc_library( - name = "cupti_link", - linkopts = [ - "-L/usr/local/cuda/extras/CUPTI/lib64", - "-lcupti", - ], -) - -cc_library( - name = "libdevice_root", - data = [":cuda-nvvm"], -) - -CUDA_INCLUDES_FILES = [ - "include/builtin_types.h", - "include/channel_descriptor.h", - "include/CL/cl_egl.h", - "include/CL/cl_ext.h", - "include/CL/cl_gl_ext.h", - "include/CL/cl_gl.h", - 
"include/CL/cl.h", - "include/CL/cl.hpp", - "include/CL/cl_platform.h", - "include/CL/opencl.h", - "include/common_functions.h", - "include/cooperative_groups.h", - "include/cooperative_groups_helpers.h", - "include/crt/common_functions.h", - "include/crt/device_double_functions.h", - "include/crt/device_double_functions.hpp", - "include/crt/device_functions.h", - "include/crt/device_functions.hpp", - "include/crt/func_macro.h", - "include/crt/host_config.h", - "include/crt/host_defines.h", - "include/crt/host_runtime.h", - "include/crt/math_functions.h", - "include/crt/math_functions.hpp", - "include/crt/mma.h", - "include/crt/mma.hpp", - "include/crt/nvfunctional", - "include/crt/sm_70_rt.h", - "include/crt/sm_70_rt.hpp", - "include/crt/storage_class.h", - # TODO: figure out why on a CI machine with CUDA 10.2 it's not present - # "include/cublas_api.h", - # "include/cublas.h", - # "include/cublas_v2.h", - # "include/cublasXt.h", - "include/cuComplex.h", - "include/cuda_device_runtime_api.h", - "include/cudaEGL.h", - "include/cuda_egl_interop.h", - "include/cuda_fp16.h", - "include/cuda_fp16.hpp", - "include/cudaGL.h", - "include/cuda_gl_interop.h", - "include/cuda.h", - "include/cudalibxt.h", - "include/cuda_occupancy.h", - "include/cuda_profiler_api.h", - "include/cudaProfiler.h", - "include/cudart_platform.h", - "include/cuda_runtime_api.h", - "include/cuda_runtime.h", - "include/cuda_surface_types.h", - "include/cuda_texture_types.h", - "include/cudaVDPAU.h", - "include/cuda_vdpau_interop.h", - "include/cufft.h", - "include/cufftw.h", - "include/cufftXt.h", - "include/curand_discrete2.h", - "include/curand_discrete.h", - "include/curand_globals.h", - "include/curand.h", - "include/curand_kernel.h", - "include/curand_lognormal.h", - "include/curand_mrg32k3a.h", - "include/curand_mtgp32dc_p_11213.h", - "include/curand_mtgp32.h", - "include/curand_mtgp32_host.h", - "include/curand_mtgp32_kernel.h", - "include/curand_normal.h", - "include/curand_normal_static.h", 
- "include/curand_philox4x32_x.h", - "include/curand_poisson.h", - "include/curand_precalc.h", - "include/curand_uniform.h", - "include/cusolver_common.h", - "include/cusolverDn.h", - "include/cusolverRf.h", - "include/cusolverSp.h", - "include/cusolverSp_LOWLEVEL_PREVIEW.h", - "include/cusparse.h", - "include/cusparse_v2.h", - "include/device_atomic_functions.h", - "include/device_atomic_functions.hpp", - "include/device_double_functions.h", - "include/device_functions.h", - "include/device_launch_parameters.h", - "include/device_types.h", - "include/driver_functions.h", - "include/driver_types.h", - "include/fatBinaryCtl.h", - "include/fatbinary.h", - "include/host_config.h", - "include/host_defines.h", - "include/library_types.h", - "include/math_constants.h", - "include/math_functions.h", - "include/mma.h", - "include/nppcore.h", - "include/nppdefs.h", - "include/npp.h", - "include/nppi_arithmetic_and_logical_operations.h", - "include/nppi_color_conversion.h", - "include/nppi_compression_functions.h", - "include/nppi_computer_vision.h", - "include/nppi_data_exchange_and_initialization.h", - "include/nppi_filtering_functions.h", - "include/nppi_geometry_transforms.h", - "include/nppi.h", - "include/nppi_linear_transforms.h", - "include/nppi_morphological_operations.h", - "include/nppi_statistics_functions.h", - "include/nppi_support_functions.h", - "include/nppi_threshold_and_compare_operations.h", - "include/npps_arithmetic_and_logical_operations.h", - "include/npps_conversion_functions.h", - "include/npps_filtering_functions.h", - "include/npps.h", - "include/npps_initialization.h", - "include/npps_statistics_functions.h", - "include/npps_support_functions.h", - # Note: CUDA 10.0 only - # "include/nppversion.h", - # TODO: figure out why on a CI machine with CUDA 10.2 it's not present - # "include/nvblas.h", - "include/nvfunctional", - "include/nvgraph.h", - "include/nvjpeg.h", - "include/nvml.h", - "include/nvrtc.h", - "include/nvToolsExtCuda.h", - 
"include/nvToolsExtCudaRt.h", - "include/nvToolsExt.h", - "include/nvToolsExtMeta.h", - "include/nvToolsExtSync.h", - "include/nvtx3/nvToolsExtCuda.h", - "include/nvtx3/nvToolsExtCudaRt.h", - "include/nvtx3/nvToolsExt.h", - "include/nvtx3/nvToolsExtOpenCL.h", - "include/nvtx3/nvToolsExtSync.h", - "include/nvtx3/nvtxDetail/nvtxImplCore.h", - "include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h", - "include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h", - "include/nvtx3/nvtxDetail/nvtxImpl.h", - "include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h", - "include/nvtx3/nvtxDetail/nvtxImplSync_v3.h", - "include/nvtx3/nvtxDetail/nvtxInitDecls.h", - "include/nvtx3/nvtxDetail/nvtxInitDefs.h", - "include/nvtx3/nvtxDetail/nvtxInit.h", - "include/nvtx3/nvtxDetail/nvtxLinkOnce.h", - "include/nvtx3/nvtxDetail/nvtxTypes.h", - "include/sm_20_atomic_functions.h", - "include/sm_20_atomic_functions.hpp", - "include/sm_20_intrinsics.h", - "include/sm_20_intrinsics.hpp", - "include/sm_30_intrinsics.h", - "include/sm_30_intrinsics.hpp", - "include/sm_32_atomic_functions.h", - "include/sm_32_atomic_functions.hpp", - "include/sm_32_intrinsics.h", - "include/sm_32_intrinsics.hpp", - "include/sm_35_atomic_functions.h", - "include/sm_35_intrinsics.h", - "include/sm_60_atomic_functions.h", - "include/sm_60_atomic_functions.hpp", - "include/sm_61_intrinsics.h", - "include/sm_61_intrinsics.hpp", - # CUDA 10.0 only - # "include/sobol_direction_vectors.h", - "include/surface_functions.h", - "include/surface_functions.hpp", - "include/surface_indirect_functions.h", - "include/surface_indirect_functions.hpp", - "include/surface_types.h", - "include/texture_fetch_functions.h", - "include/texture_fetch_functions.hpp", - "include/texture_indirect_functions.h", - "include/texture_indirect_functions.hpp", - "include/texture_types.h", - "include/vector_functions.h", - "include/vector_functions.hpp", - "include/vector_types.h", -] - -genrule( - name = "cuda-include", - outs = CUDA_INCLUDES_FILES, - cmd = " && ".join([ - "ln -s 
/usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p) - for p in CUDA_INCLUDES_FILES - ]), - local = True, - tags = ["no-cache"], -) - -CUDA_NVVM_FILES = [ - "nvvm/bin/cicc", - "nvvm/include/nvvm.h", - "nvvm/lib64/libnvvm.so", - "nvvm/lib64/libnvvm.so.3", - "nvvm/lib64/libnvvm.so.3.3.0", - "nvvm/libdevice/libdevice.10.bc", -] - -genrule( - name = "cuda-nvvm", - outs = CUDA_NVVM_FILES, - cmd = " && ".join([ - "ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p) - for p in CUDA_NVVM_FILES - ]), - local = True, - tags = ["no-cache"], -) - -CUDA_EXTRAS_FILES = [ - "extras/CUPTI/include/cuda_stdint.h", - "extras/CUPTI/include/cupti.h", - "extras/CUPTI/include/cupti_activity.h", - "extras/CUPTI/include/cupti_callbacks.h", - "extras/CUPTI/include/cupti_driver_cbid.h", - "extras/CUPTI/include/cupti_events.h", - "extras/CUPTI/include/cupti_metrics.h", - "extras/CUPTI/include/cupti_nvtx_cbid.h", - "extras/CUPTI/include/cupti_result.h", - "extras/CUPTI/include/cupti_runtime_cbid.h", - "extras/CUPTI/include/cupti_version.h", - "extras/CUPTI/include/generated_cuda_gl_interop_meta.h", - "extras/CUPTI/include/generated_cuda_meta.h", - "extras/CUPTI/include/generated_cuda_runtime_api_meta.h", - "extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h", - "extras/CUPTI/include/generated_cudaGL_meta.h", - "extras/CUPTI/include/generated_cudaVDPAU_meta.h", - "extras/CUPTI/include/generated_nvtx_meta.h", - "extras/CUPTI/include/GL/gl.h", - "extras/CUPTI/include/GL/glew.h", - "extras/CUPTI/include/GL/glext.h", - "extras/CUPTI/include/GL/glu.h", - "extras/CUPTI/include/GL/glut.h", - "extras/CUPTI/include/GL/glx.h", - "extras/CUPTI/include/GL/glxext.h", - "extras/CUPTI/include/GL/wglew.h", - "extras/CUPTI/include/GL/wglext.h", - "extras/CUPTI/include/openacc/cupti_openacc.h", -] - -genrule( - name = "cuda-extras", - outs = CUDA_EXTRAS_FILES, - cmd = " && ".join([ - "ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p) - for p in 
CUDA_EXTRAS_FILES - ]), - local = True, - tags = ["no-cache"], -) - -genrule( - name = "cudnn-include", - outs = [ - "include/cudnn.h", - ], - cmd = """ - ln -s /usr/include/cudnn.h $(@D)/cudnn.h""", - local = True, - tags = ["no-cache"], -) - diff --git a/third_party/tensorpipe.BUILD b/third_party/tensorpipe.BUILD index ae210f473933..66c6a795162a 100644 --- a/third_party/tensorpipe.BUILD +++ b/third_party/tensorpipe.BUILD @@ -162,8 +162,8 @@ cc_library( cc_library( name = "tensorpipe_cuda", - srcs = TENSORPIPE_CUDA_SOURCES, - hdrs = TENSORPIPE_CUDA_HEADERS + [":tensorpipe_cuda_config_header"], + srcs = glob(TENSORPIPE_CUDA_SOURCES), + hdrs = glob(TENSORPIPE_CUDA_HEADERS) + [":tensorpipe_cuda_config_header"], includes = [ ".", ], diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index afdc9cb3d6a4..bdfc20dfbc89 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -2342,22 +2342,13 @@ # miopen - name: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - self, weight, bias: "grad.defined() ? miopen_convolution_transpose_backward(self, grad, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic, grad_input_mask) : std::tuple()" - -- name: miopen_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, true, output_padding, groups, benchmark, deterministic, true, false, grad_input_mask) + self, weight, bias: "grad.defined() ? 
convolution_backward(grad, self, weight, bias->sizes(), stride, padding, dilation, true, output_padding, groups, grad_input_mask) : std::tuple()" - name: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - self, weight, bias: "grad.defined() ? miopen_convolution_backward(self, grad, weight, padding, stride, dilation, groups, benchmark, deterministic, grad_input_mask) : std::tuple()" - -- name: miopen_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, std::vector(padding.size(), 0), groups, benchmark, deterministic, true, false, grad_input_mask) + self, weight, bias: "grad.defined() ? convolution_backward(grad, self, weight, bias->sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" - name: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - self, weight, bias: "grad.defined() ? 
miopen_depthwise_convolution_backward(self, grad, weight, padding, stride, dilation, groups, benchmark, deterministic, grad_input_mask) : std::tuple()" - -- name: miopen_depthwise_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, std::vector(padding.size(), 0), groups, benchmark, deterministic, true, false, grad_input_mask) + self, weight, bias: "grad.defined() ? convolution_backward(grad, self, weight, bias->sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" - name: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor) input, weight, bias: "grad.defined() ? (training ? 
miopen_batch_norm_backward(input, grad.contiguous(), weight, running_mean, running_var, result1, result2, epsilon) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple()" diff --git a/tools/codegen/utils.py b/tools/codegen/utils.py index 406ae0d2ce89..3c3e3a42c768 100644 --- a/tools/codegen/utils.py +++ b/tools/codegen/utils.py @@ -1,11 +1,11 @@ -import re +import contextlib +import functools +import hashlib import os +import re +import textwrap from typing import Tuple, List, Iterable, Iterator, Callable, Sequence, TypeVar, Optional, Dict, Any, Union, Set, NoReturn from enum import Enum -import contextlib -import textwrap -import hashlib -import functools from tools.codegen.code_template import CodeTemplate @@ -139,6 +139,8 @@ class FileManager: except IOError: old_contents = None if contents != old_contents: + # Create output directory if it doesn't exist + os.makedirs(os.path.dirname(filename), exist_ok=True) with open(filename, 'w') as f: f.write(contents) diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 332e784f0b06..339a9f99707e 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -1,8 +1,7 @@ +import argparse import collections from pprint import pformat -import argparse - from tools.codegen.model import Variant from tools.codegen.api.python import (PythonSignatureGroup, PythonSignatureNativeFunctionPair) @@ -10,7 +9,7 @@ from tools.codegen.gen import parse_native_yaml from tools.codegen.utils import FileManager from typing import Sequence, List, Dict -from ..autograd.gen_python_functions import should_generate_py_binding, load_signatures, group_overloads +from tools.autograd.gen_python_functions import should_generate_py_binding, load_signatures, group_overloads """ This module implements generation of type stubs for PyTorch, diff --git a/tools/rules/cu.bzl b/tools/rules/cu.bzl index 726fadfe98bd..aec4a874cf00 100644 --- a/tools/rules/cu.bzl 
+++ b/tools/rules/cu.bzl @@ -1,3 +1,6 @@ -# gpu support is not available -def cu_library(**kwargs): - pass +load("@rules_cuda//cuda:defs.bzl", "cuda_library") + +NVCC_COPTS = ["--expt-relaxed-constexpr", "--expt-extended-lambda"] + +def cu_library(name, srcs, copts = [], **kwargs): + cuda_library(name, srcs = srcs, copts = NVCC_COPTS + copts, **kwargs) diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index 686b5c4a34f4..ff175771fd18 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -118,20 +118,30 @@ class CMake: cmake_command = 'cmake' if IS_WINDOWS: return cmake_command - cmake3 = which('cmake3') - cmake = which('cmake') - if cmake3 is not None and CMake._get_version(cmake3) >= LooseVersion("3.10.0"): - cmake_command = 'cmake3' - return cmake_command - elif cmake is not None and CMake._get_version(cmake) >= LooseVersion("3.10.0"): - return cmake_command - else: + cmake3_version = CMake._get_version(which('cmake3')) + cmake_version = CMake._get_version(which('cmake')) + + _cmake_min_version = LooseVersion("3.10.0") + if all((ver is None or ver < _cmake_min_version for ver in [cmake_version, cmake3_version])): raise RuntimeError('no cmake or cmake3 with version >= 3.10.0 found') + if cmake3_version is None: + cmake_command = 'cmake' + elif cmake_version is None: + cmake_command = 'cmake3' + else: + if cmake3_version >= cmake_version: + cmake_command = 'cmake3' + else: + cmake_command = 'cmake' + return cmake_command + @staticmethod - def _get_version(cmd: str) -> Any: + def _get_version(cmd: Optional[str]) -> Any: "Returns cmake version." 
+ if cmd is None: + return None for line in check_output([cmd, '--version']).decode('utf-8').split('\n'): if 'version' in line: return LooseVersion(line.strip().split(' ')[2]) diff --git a/torch/ao/quantization/_dbr/auto_trace.py b/torch/ao/quantization/_dbr/auto_trace.py index c37d8dc3715f..14d1d6a7b003 100644 --- a/torch/ao/quantization/_dbr/auto_trace.py +++ b/torch/ao/quantization/_dbr/auto_trace.py @@ -34,6 +34,7 @@ enable_logging = False def add_auto_observation( model : torch.nn.Module, + qconfig_dict: Dict[str, Any], example_inputs: Tuple[Any], input_dtypes: Any = (torch.float,), # must be same structure as model inputs output_dtypes: Any = (torch.float,), # must be same structure as model outputs @@ -204,9 +205,11 @@ def add_auto_observation( global_disable_torch_function_override global_disable_torch_function_override = True + # mypy ignore is used instead of assert because this + # runs on every forward and assert has a performance cost args, kwargs = parent_qstate.op_prepare_before_hook( cur_module, args, kwargs, first_call, qtensor_id, - fqn, cur_module) + fqn, cur_module) # type: ignore[arg-type] # original forward output = orig_module_call(self, *args, **kwargs) @@ -263,27 +266,29 @@ def add_auto_observation( # Create a list before iterating because we are adding new # named modules inside the loop. named_modules = list(self.named_modules()) - for k, v in named_modules: + for fqn, v in named_modules: - # k is the global FQN, i.e. 'foo.bar.baz' + # fqn is the global FQN, i.e. 
'foo.bar.baz' # v is the module instance # # we need to associate the global FQN with SeenOp # for modules, this is the module FQN # for functions, this is the parent module FQN - module_id_to_fqn[id(v)] = k + module_id_to_fqn[id(v)] = fqn - has_qconfig = hasattr(v, 'qconfig') and v.qconfig is not None - if has_qconfig and not is_leaf(v): - if v is self: - # for the top level module only, specify input - # and output dtypes - v._auto_quant_state = AutoQuantizationState( - v.qconfig, input_dtypes, output_dtypes) - pass - else: - v._auto_quant_state = AutoQuantizationState( - v.qconfig) + if is_leaf(v): + continue + + if v is self: + # for the top level module only, specify input + # and output dtypes + v._auto_quant_state = AutoQuantizationState( + qconfig_dict, fqn, + input_dtypes, output_dtypes) + pass + else: + v._auto_quant_state = AutoQuantizationState( + qconfig_dict, fqn) global_op_idx[0] = 0 diff --git a/torch/ao/quantization/_dbr/qconfig_dict_utils.py b/torch/ao/quantization/_dbr/qconfig_dict_utils.py new file mode 100644 index 000000000000..68314a8fa5be --- /dev/null +++ b/torch/ao/quantization/_dbr/qconfig_dict_utils.py @@ -0,0 +1,27 @@ +from typing import Dict, Any + +import torch + +TYPE_TO_REPLACEMENT_TYPE = { + torch.add: torch.Tensor.add, + torch.Tensor.add_: torch.Tensor.add, + torch.mul: torch.Tensor.mul, + torch.Tensor.mul_: torch.Tensor.mul, +} + +def normalize_object_types(qconfig_dict: Dict[str, Any]) -> None: + """ + This function looks for entries in `qconfig_dict['object_type']` + corresponding to PyTorch overrides of Python math functions + such as `torch.add` and `torch.mul`. If any of these functions are found, + it changes the type to the tensor variant of these functions. + This is needed because the tensor variant is what is expected + within the framework. 
+ """ + if 'object_type' not in qconfig_dict: + return + + for idx, (target_type, qconfig) in enumerate(qconfig_dict['object_type']): + replacement_type = TYPE_TO_REPLACEMENT_TYPE.get(target_type, None) + if replacement_type is not None: + qconfig_dict['object_type'][idx] = (replacement_type, qconfig) diff --git a/torch/ao/quantization/_dbr/quantization_state.py b/torch/ao/quantization/_dbr/quantization_state.py index 9515cb921571..a1b14af761f8 100644 --- a/torch/ao/quantization/_dbr/quantization_state.py +++ b/torch/ao/quantization/_dbr/quantization_state.py @@ -31,6 +31,7 @@ from .utils import ( get_producer_of_seen_op_info, clone_detach_tensor_without_dispatch, get_input_args_quant_dequant_info, + get_cur_qconfig, ) OpConvertInfo = Tuple[ @@ -64,16 +65,15 @@ class AutoQuantizationState(torch.nn.Module): def __init__( self, - qconfig, + qconfig_dict: Dict[str, Any], + fqn: str, input_dtypes: Any = None, output_dtypes: Any = None, ): super().__init__() self.idx = 0 - # TODO(future PR): change this to the subset of qconfig_dict - # relevant to the parent module - assert qconfig is not None - self.qconfig = qconfig + self.qconfig_dict = qconfig_dict + self.fqn = fqn # this is a ModuleDict in order to properly register observers # to be within the module hierarchy. 
self.tensor_id_to_observer = torch.nn.ModuleDict() @@ -247,7 +247,7 @@ class AutoQuantizationState(torch.nn.Module): kwargs: Dict[str, Any], first_call: bool, qtensor_id: List[int], - fqn: Optional[str], + fqn: str, root_module: torch.nn.Module, ) -> Tuple[Tuple[Any, ...], Dict[str, Any]]: """ @@ -613,6 +613,7 @@ class AutoQuantizationState(torch.nn.Module): arg_tensor_infos: List[Optional[QTensorInfo]], func_output_dtype_type: FuncOutputDTypeType, qtensor_id: List[int], + fqn: str, ) -> None: """ Runs the prepare hook during first_call for individual @@ -642,13 +643,17 @@ class AutoQuantizationState(torch.nn.Module): # which will be converted to a quant later # TODO(future PR): share these observers if multiple ops need # this quant. - # TODO(future PR): create from qconfig of op instead of global - # qconfig. - if arg._qtensor_info.inf_dtype != torch.quint8: # type: ignore[attr-defined] + qconfig = get_cur_qconfig(self.qconfig_dict, fqn, op) + if qconfig is None: + # If qconfig is None, we do not need any input observers + return + elif arg._qtensor_info.inf_dtype != torch.quint8: # type: ignore[attr-defined] + # TODO(future PR): currently this only handles float32 and + # quint8, we need to extend it to other dtypes tensor_id = arg._qtensor_info.id # type: ignore[attr-defined] weight_arg_idx = get_weight_arg_idx(op) - obs = self.qconfig.weight() if arg_idx == weight_arg_idx else \ - self.qconfig.activation() + obs = qconfig.weight() if arg_idx == weight_arg_idx else \ + qconfig.activation() self.tensor_id_to_observer[str(tensor_id)] = obs def _first_call_op_prepare_before_hook_create_subgraphs( @@ -658,7 +663,7 @@ class AutoQuantizationState(torch.nn.Module): kwargs: Dict[str, Any], first_call: bool, qtensor_id: List[int], - fqn: Optional[str], + fqn: str, root_module: torch.nn.Module, ) -> Tuple[Tuple[Any, ...], Dict[str, Any]]: """ @@ -679,13 +684,13 @@ class AutoQuantizationState(torch.nn.Module): 
self._first_call_op_prepare_before_hook_create_subgraphs_tensor( op, inner_arg, arg_idx, input_observed_arg_idxs, arg_tensor_infos, func_output_dtype_type, - qtensor_id) + qtensor_id, fqn) arg_idx += 1 else: self._first_call_op_prepare_before_hook_create_subgraphs_tensor( op, arg, arg_idx, input_observed_arg_idxs, arg_tensor_infos, func_output_dtype_type, - qtensor_id) + qtensor_id, fqn) arg_idx += 1 packable_tensor_idx_to_name = {} @@ -716,11 +721,12 @@ class AutoQuantizationState(torch.nn.Module): if self.idx not in self.idx_to_seen_op_infos: op_type_is_module = isinstance(op, torch.nn.Module) op_type = type(op) if op_type_is_module else op + qconfig = get_cur_qconfig(self.qconfig_dict, fqn, op) self.idx_to_seen_op_infos[self.idx] = SeenOpInfo( self.idx, op_type, op_type_is_module, fqn, arg_tensor_infos, [], packable_tensor_idx_to_name, packable_nontensor_idx_to_arg, packable_tensor_kwarg_name_to_name, - op_packing_only_uses_module_attributes) + op_packing_only_uses_module_attributes, qconfig) return args, kwargs @@ -742,8 +748,11 @@ class AutoQuantizationState(torch.nn.Module): does not exist in the "before" hook. 
""" if func_output_obs_type == FuncOutputObsType.NEW_OBS: + # TODO(future PR): check qconfig is None + qconfig = get_cur_qconfig(self.qconfig_dict, seen_op_info.fqn, op) + assert qconfig is not None self.tensor_id_to_observer[str(qtensor_id[0])] = \ - self.qconfig.activation() + qconfig.activation() elif func_output_obs_type == FuncOutputObsType.REUSES_FIRST_INPUT_OBS: first_input_tensor_id = seen_op_info.input_tensor_infos[0].id @@ -777,7 +786,10 @@ class AutoQuantizationState(torch.nn.Module): if first_input_mod and hasattr(first_input_mod, 'activation_post_process'): first_input_obs = first_input_mod.activation_post_process else: - first_input_obs = self.qconfig.activation() + # TODO(future PR): check qconfig is None + qconfig = get_cur_qconfig(self.qconfig_dict, seen_op_info.fqn, op) + assert qconfig is not None + first_input_obs = qconfig.activation() self.tensor_id_to_observer[str(qtensor_id[0])] = first_input_obs @@ -788,10 +800,24 @@ class AutoQuantizationState(torch.nn.Module): func_output_dtype_type = get_func_output_dtype_type( op, args, seen_op_info.op_packing_only_uses_module_attributes) if func_output_dtype_type == FuncOutputDTypeType.DTYPE_DEPENDS_ON_QCONFIG: - dtype_to_use = torch.quint8 + if isinstance(op, torch.nn.Module): + # For now, assume that eager mode convert has attached qconfig + # objects to any leaf module which needs quantization + if hasattr(op, 'activation_post_process'): + dtype_to_use = op.activation_post_process.dtype + else: + dtype_to_use = torch.float + else: + qconfig = get_cur_qconfig(self.qconfig_dict, seen_op_info.fqn, op) + if qconfig is None: + dtype_to_use = torch.float + else: + dtype_to_use = qconfig.activation().dtype + elif func_output_dtype_type == FuncOutputDTypeType.DTYPE_DEFAULT_BC_UNSUPPORTED_SYNTAX: dtype_to_use = torch.float else: + # TODO(future PR): respect qconfig for torch.cat if isinstance(args[0], (tuple, list)): # for torch.cat unique_arg_dtypes = [ arg._qtensor_info.inf_dtype for arg in args[0]] diff 
--git a/torch/ao/quantization/_dbr/utils.py b/torch/ao/quantization/_dbr/utils.py index 13f31a314b8e..39e41e63b362 100644 --- a/torch/ao/quantization/_dbr/utils.py +++ b/torch/ao/quantization/_dbr/utils.py @@ -15,11 +15,17 @@ from .mappings import ( add_and_mul_ops, ) +from ..qconfig import QConfigAny + from torch.quantization import ( ObserverBase, FakeQuantizeBase, ) +from ..qconfig_dict_utils import ( + maybe_adjust_qconfig_for_module_type_or_name, +) + def _raise_obs_not_found_error(func): raise RuntimeError( f'Encountered arithmetic operation {torch.typename(func)} but we have ' @@ -75,6 +81,8 @@ SeenOpInfo = collections.namedtuple( # This is False if some packable args are results of other functions. # bool 'op_packing_only_uses_module_attributes', + # QConfig for the op, can be None + 'qconfig', ], ) def seen_op_info_repr(self) -> str: @@ -113,6 +121,7 @@ class ObserverWrapper(torch.nn.Identity): def __init__(self, child): super().__init__() self.child = child + self.dtype = child.dtype def wrap_observers_in_placeholders(module: torch.nn.Module) -> None: """ @@ -182,6 +191,9 @@ def get_func_output_obs_type( if is_module: return FuncOutputObsType.NONE + if seen_op_info.qconfig is None: + return FuncOutputObsType.NONE + # check for ops which need packed weights but the weights are # coming from another function if not seen_op_info.op_packing_only_uses_module_attributes: @@ -214,6 +226,8 @@ def converted_func_needs_scale_zp(seen_op_info: SeenOpInfo) -> bool: is_module = isinstance(op_type, type(torch.nn.Module)) if is_module: return False + if seen_op_info.qconfig is None: + return False if op_type in add_and_mul_ops: # check if both arguments are tensors inputs = seen_op_info.input_tensor_infos @@ -623,3 +637,24 @@ def get_input_args_quant_dequant_info( quant_infos.append(None) dequant_infos.append(False) return quant_infos, dequant_infos, any_arg_quant_or_dequant_needed + +def get_cur_qconfig( + qconfig_dict: Dict[str, Any], + cur_fqn: str, + cur_op: 
Callable, +) -> Optional[QConfigAny]: + # precedence: global -> object_type -> module_name_regex -> module_name + # -> module_name_object_type_order + # (module_name_regex, module_name_object_type_order not implemented yet) + + # global + global_qconfig = qconfig_dict[''] + + # object_type + is_module = isinstance(cur_op, type(torch.nn.Module)) + cur_op_type = type(cur_op) if is_module else cur_op + + qconfig = maybe_adjust_qconfig_for_module_type_or_name( + qconfig_dict, cur_op_type, cur_fqn, global_qconfig) + + return qconfig diff --git a/torch/ao/quantization/_quantize_dbr.py b/torch/ao/quantization/_quantize_dbr.py index f35fdf0cfb45..6e96427c965c 100644 --- a/torch/ao/quantization/_quantize_dbr.py +++ b/torch/ao/quantization/_quantize_dbr.py @@ -2,27 +2,46 @@ import torch from ._dbr.auto_trace import add_auto_observation, add_auto_convert from ._dbr.fusion import get_module_fusion_fqns +from ._dbr.qconfig_dict_utils import normalize_object_types + +from .qconfig_dict_utils import ( + get_flattened_qconfig_dict, + convert_dict_to_ordered_dict, +) -def prepare(model, example_inputs, inplace=False, allow_list=None, +def prepare(model, qconfig_dict, example_inputs, inplace=False, allow_list=None, observer_non_leaf_module_list=None, prepare_custom_config_dict=None, fuse_modules=True): r"""A wrapper around `torch.quantization.prepare` which prepares the - model for quantization using dynamic tracing. Requires `example_inputs` to build + model for quantization using dynamic tracing. + + Requires `qconfig_dict` (same format as prepare_fx) to specify the + quantization settings. Not all functionality is supported yet. + + Requires `example_inputs` to build the graph before calibration or quantization aware training can proceed. 
TODO(future PR): better docblock """ assert example_inputs is not None, 'example_inputs must be specified' + for qconfig_dict_option in ('module_name_regex', 'module_name_object_type_order'): + assert qconfig_dict_option not in qconfig_dict, \ + f'{qconfig_dict_option} option of qconfig_dict is not ' + \ + 'implemented yet in define-by-run quantization' + + normalize_object_types(qconfig_dict) + convert_dict_to_ordered_dict(qconfig_dict) + flattened_qconfig_dict = get_flattened_qconfig_dict(qconfig_dict) + torch.quantization.propagate_qconfig_(model, flattened_qconfig_dict) + # TODO(future PR): QAT support + if fuse_modules: # automatically fuse modules old_class = model.__class__ - # For now, need to propagate qconfig before observing, because - # AutoQuantizationState needs a qconfig to work - torch.quantization.propagate_qconfig_(model) - model = add_auto_observation(model, example_inputs) + model = add_auto_observation(model, qconfig_dict, example_inputs) module_fusion_fqns = get_module_fusion_fqns(model) if len(module_fusion_fqns): model = torch.quantization.fuse_modules(model, module_fusion_fqns) @@ -55,7 +74,7 @@ def prepare(model, example_inputs, inplace=False, allow_list=None, model, inplace, allow_list, observer_non_leaf_module_list, prepare_custom_config_dict) assert not inplace - model = add_auto_observation(model, example_inputs) + model = add_auto_observation(model, qconfig_dict, example_inputs) return model diff --git a/torch/ao/quantization/fx/__init__.py b/torch/ao/quantization/fx/__init__.py index b374e083da23..08d613fae771 100644 --- a/torch/ao/quantization/fx/__init__.py +++ b/torch/ao/quantization/fx/__init__.py @@ -1,4 +1,4 @@ from .prepare import prepare from .convert import convert from .fuse import Fuser -from .backend_config_dict import get_tensorrt_backend_config_dict +from .backend_config import get_tensorrt_backend_config_dict diff --git a/torch/ao/quantization/fx/_convert_do_not_use.py b/torch/ao/quantization/fx/_convert_do_not_use.py 
index 9631fdb435dd..0a5267b4442c 100644 --- a/torch/ao/quantization/fx/_convert_do_not_use.py +++ b/torch/ao/quantization/fx/_convert_do_not_use.py @@ -12,8 +12,9 @@ from ..utils import ( activation_is_int8_quantized, weight_is_statically_quantized, get_qparam_dict, + _parent_name, ) -from .backend_config_dict.utils import get_quantized_reference_module_mapping +from .backend_config.utils import get_quantized_reference_module_mapping from .graph_module import ( QuantizedGraphModule, @@ -23,7 +24,6 @@ from .utils import ( get_custom_module_class_keys, get_quantize_node_info, create_getattr_from_value, - _parent_name, ) from torch.ao.quantization.quantize import ( diff --git a/torch/ao/quantization/fx/_equalize.py b/torch/ao/quantization/fx/_equalize.py index 546e41e5408c..bbebc628b580 100644 --- a/torch/ao/quantization/fx/_equalize.py +++ b/torch/ao/quantization/fx/_equalize.py @@ -9,14 +9,16 @@ from .utils import ( WEIGHT_INDEX_DICT, get_new_attr_name_with_prefix, maybe_get_next_module, - _parent_name, ) from ..observer import ( PerChannelMinMaxObserver, _with_args, ObserverBase, ) -from ..utils import check_min_max_valid +from ..utils import ( + check_min_max_valid, + _parent_name, +) from collections import namedtuple from typing import Dict, Any, List, Tuple, Optional diff --git a/torch/ao/quantization/fx/_lower_to_native_backend.py b/torch/ao/quantization/fx/_lower_to_native_backend.py index c8418bdc3dd0..79eda8747f91 100644 --- a/torch/ao/quantization/fx/_lower_to_native_backend.py +++ b/torch/ao/quantization/fx/_lower_to_native_backend.py @@ -5,7 +5,7 @@ from .graph_module import QuantizedGraphModule from .quantized_fusion_patterns_and_replacements import get_fbgemm_patterns_and_replacements from .match_utils import is_match from .match_utils import MatchAllNode -from .utils import _parent_name +from ..utils import _parent_name from typing import Dict, Type # Mapping from reference module class to the replacement quantized module class for lowering diff --git 
a/torch/ao/quantization/fx/backend_config_dict/__init__.py b/torch/ao/quantization/fx/backend_config/__init__.py similarity index 100% rename from torch/ao/quantization/fx/backend_config_dict/__init__.py rename to torch/ao/quantization/fx/backend_config/__init__.py diff --git a/torch/ao/quantization/fx/backend_config_dict/fuse_handler.py b/torch/ao/quantization/fx/backend_config/fuse_handler.py similarity index 100% rename from torch/ao/quantization/fx/backend_config_dict/fuse_handler.py rename to torch/ao/quantization/fx/backend_config/fuse_handler.py diff --git a/torch/ao/quantization/fx/backend_config_dict/observation_type.py b/torch/ao/quantization/fx/backend_config/observation_type.py similarity index 100% rename from torch/ao/quantization/fx/backend_config_dict/observation_type.py rename to torch/ao/quantization/fx/backend_config/observation_type.py diff --git a/torch/ao/quantization/fx/backend_config_dict/quantize_handler.py b/torch/ao/quantization/fx/backend_config/quantize_handler.py similarity index 100% rename from torch/ao/quantization/fx/backend_config_dict/quantize_handler.py rename to torch/ao/quantization/fx/backend_config/quantize_handler.py diff --git a/torch/ao/quantization/fx/backend_config_dict/tensorrt.py b/torch/ao/quantization/fx/backend_config/tensorrt.py similarity index 100% rename from torch/ao/quantization/fx/backend_config_dict/tensorrt.py rename to torch/ao/quantization/fx/backend_config/tensorrt.py diff --git a/torch/ao/quantization/fx/backend_config_dict/utils.py b/torch/ao/quantization/fx/backend_config/utils.py similarity index 100% rename from torch/ao/quantization/fx/backend_config_dict/utils.py rename to torch/ao/quantization/fx/backend_config/utils.py diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py index d1a0546c7c49..2c7606eeb4c3 100644 --- a/torch/ao/quantization/fx/convert.py +++ b/torch/ao/quantization/fx/convert.py @@ -25,12 +25,14 @@ from .graph_module import ( from 
.quantization_patterns import ( QuantizeHandler, ) -from .qconfig_utils import ( +from ..qconfig_dict_utils import ( convert_dict_to_ordered_dict, + update_qconfig_for_qat, +) +from .qconfig_utils import ( generate_qconfig_map, compare_prepare_convert_qconfig_dict, update_qconfig_for_fusion, - update_qconfig_for_qat, ) from ._equalize import update_obs_for_equalization, convert_eq_obs from .utils import ( diff --git a/torch/ao/quantization/fx/fuse.py b/torch/ao/quantization/fx/fuse.py index 5da008271323..02a43f996801 100644 --- a/torch/ao/quantization/fx/fuse.py +++ b/torch/ao/quantization/fx/fuse.py @@ -15,8 +15,8 @@ from .pattern_utils import ( get_default_fusion_patterns, ) -from .backend_config_dict.utils import get_fusion_pattern_to_fuse_handler_cls -from .backend_config_dict.utils import get_fuser_method_mapping +from .backend_config.utils import get_fusion_pattern_to_fuse_handler_cls +from .backend_config.utils import get_fuser_method_mapping from .fusion_patterns import * # noqa: F401,F403 diff --git a/torch/ao/quantization/fx/fusion_patterns.py b/torch/ao/quantization/fx/fusion_patterns.py index d86c1cd4e590..8471325333af 100644 --- a/torch/ao/quantization/fx/fusion_patterns.py +++ b/torch/ao/quantization/fx/fusion_patterns.py @@ -3,7 +3,7 @@ from torch.fx.graph import Node from .pattern_utils import ( register_fusion_pattern, ) -from .utils import _parent_name +from ..utils import _parent_name from .quantization_types import QuantizerCls, NodePattern, Pattern from ..fuser_method_mappings import get_fuser_method_new from abc import ABC, abstractmethod diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py index bc11fa666ae9..e94620cd9002 100644 --- a/torch/ao/quantization/fx/prepare.py +++ b/torch/ao/quantization/fx/prepare.py @@ -16,15 +16,16 @@ from ..quantize import ( from ..observer import ( ObserverBase, ) -from ..qconfig import QConfigAny -from ..qconfig import is_reuse_input_qconfig -from .qconfig_utils import ( - 
convert_dict_to_ordered_dict, - generate_qconfig_map, +from ..qconfig import QConfigAny, is_reuse_input_qconfig +from ..qconfig_dict_utils import ( get_flattened_qconfig_dict, - update_qconfig_for_fusion, + convert_dict_to_ordered_dict, update_qconfig_for_qat, ) +from .qconfig_utils import ( + generate_qconfig_map, + update_qconfig_for_fusion, +) from .quantization_patterns import ( QuantizeHandler, @@ -53,8 +54,8 @@ from .match_utils import ( find_matches, ) +from ..utils import _parent_name from .utils import ( - _parent_name, get_custom_module_class_keys, all_node_args_have_no_tensors, assert_and_get_unique_device, @@ -82,7 +83,7 @@ from ..utils import ( activation_is_int8_quantized, ) -from .backend_config_dict.utils import ( +from .backend_config.utils import ( get_pattern_to_quantize_handlers, get_pattern_to_dtype_configs, get_pattern_to_input_type_to_index, diff --git a/torch/ao/quantization/fx/qconfig_utils.py b/torch/ao/quantization/fx/qconfig_utils.py index b8b754bf81e4..5738637456bc 100644 --- a/torch/ao/quantization/fx/qconfig_utils.py +++ b/torch/ao/quantization/fx/qconfig_utils.py @@ -1,11 +1,10 @@ import torch -from collections import OrderedDict, defaultdict -from typing import Union, Callable, Any, Dict, Tuple, Set, Optional +from collections import defaultdict +from typing import Callable, Any, Dict, Tuple, Set, Optional from torch.ao.quantization.qconfig import add_module_to_qconfig_obs_ctr, QConfigAny, qconfig_equals from torch.ao.quantization.quantize import ( is_activation_post_process, ) -import re from torch.fx import ( GraphModule, ) @@ -13,88 +12,14 @@ from torch.fx.graph import ( Graph, ) -from .utils import _parent_name -from ..utils import ( - get_combined_dict, -) +from ..utils import _parent_name from ..fuser_method_mappings import DEFAULT_OP_LIST_TO_FUSER_METHOD -from ..quantization_mappings import ( - get_default_qat_module_mappings, +from ..qconfig_dict_utils import ( + get_object_type_qconfig, + 
maybe_adjust_qconfig_for_module_type_or_name, ) -def get_flattened_qconfig_dict(qconfig_dict): - """ flatten the global, object_type and module_name qconfig - to the same qconfig_dict so that it can be used by - propagate_qconfig_ function. - "module_name_regex" is ignored for now since it's not supported - in propagate_qconfig_, but it can be fixed later. - - For example: - Input: { - "": qconfig, - "object_type": [ - (torch.add, qconfig) - ], - "module_name": [ - ("conv", qconfig) - ] - } - - Output: { - "": qconfig, - torch.add: qconfig, - "conv": qconfig - } - """ - flattened = dict() - if '' in qconfig_dict: - flattened[''] = qconfig_dict[''] - - def flatten_key(key): - if key in qconfig_dict: - for (obj, qconfig) in qconfig_dict[key].items(): - flattened[obj] = qconfig - - flatten_key('object_type') - flatten_key('module_name') - return flattened - - -def convert_dict_to_ordered_dict(qconfig_dict: Any) -> Dict[str, Dict[Any, Any]]: - """ Convert dict in qconfig_dict to ordered dict - """ - # convert a qconfig list for a type to OrderedDict - def _convert_to_ordered_dict(key, qconfig_dict): - qconfig_dict[key] = OrderedDict(qconfig_dict.get(key, [])) - - _convert_to_ordered_dict('object_type', qconfig_dict) - _convert_to_ordered_dict('module_name_regex', qconfig_dict) - _convert_to_ordered_dict('module_name', qconfig_dict) - return qconfig_dict - - -def get_object_type_qconfig( - qconfig_dict: Any, - object_type: Union[Callable, str], - fallback_qconfig: QConfigAny) -> QConfigAny: - # object_type can be - # 1. module type (call_module) - # 2. function (call_function) - # 3. 
string (call_method) - return qconfig_dict['object_type'].get( - object_type, fallback_qconfig) - - -def get_module_name_regex_qconfig(qconfig_dict, module_name, fallback_qconfig): - for regex_pattern, qconfig in \ - qconfig_dict['module_name_regex'].items(): - if re.match(regex_pattern, module_name): - # first match wins - return qconfig - return fallback_qconfig - - def maybe_adjust_qconfig_for_module_name_object_type_order( qconfig_dict: Any, cur_module_path: str, @@ -116,48 +41,6 @@ def maybe_adjust_qconfig_for_module_name_object_type_order( return fallback_qconfig -def get_module_name_qconfig(qconfig_dict, module_name, fallback_qconfig): - if module_name == '': - # module name qconfig not found - return fallback_qconfig - if module_name in qconfig_dict['module_name']: - return qconfig_dict['module_name'][module_name] - else: - parent, _ = _parent_name(module_name) - return get_module_name_qconfig(qconfig_dict, parent, fallback_qconfig) - -# get qconfig for module_name, -# fallback to module_name_regex_qconfig, module_type_qconfig, -# global_qconfig if necessary - - -def maybe_adjust_qconfig_for_module_type_or_name(qconfig_dict, module_type, module_name, global_qconfig): - module_type_qconfig = get_object_type_qconfig( - qconfig_dict, module_type, global_qconfig) - module_name_regex_qconfig = get_module_name_regex_qconfig( - qconfig_dict, module_name, module_type_qconfig) - module_name_qconfig = get_module_name_qconfig( - qconfig_dict, module_name, module_name_regex_qconfig) - return module_name_qconfig - - -def update_qconfig_for_qat( - qconfig_dict: Any, - additional_qat_module_mapping: Dict[Callable, Callable] -) -> Any: - """ - Update the qconfig_dict to account for module swaps during QAT. - During QAT we perform a module swap on the nn.Module types to the corresponding nn.qat.modules types. 
- """ - all_qat_mappings = get_combined_dict( - get_default_qat_module_mappings(), additional_qat_module_mapping) - object_type_dict = qconfig_dict.get("object_type", None) - new_object_type_dict = object_type_dict.copy() - for k, v in new_object_type_dict.items(): - if k in all_qat_mappings: - object_type_dict[all_qat_mappings[k]] = v - return qconfig_dict - def update_qconfig_for_fusion( model: GraphModule, qconfig_dict: Any, diff --git a/torch/ao/quantization/fx/quantization_patterns.py b/torch/ao/quantization/fx/quantization_patterns.py index 22218c7ae93c..8a9d419c6b6b 100644 --- a/torch/ao/quantization/fx/quantization_patterns.py +++ b/torch/ao/quantization/fx/quantization_patterns.py @@ -33,9 +33,8 @@ from .pattern_utils import ( get_default_output_activation_post_process_map, Pattern, ) - +from ..utils import _parent_name from .utils import ( - _parent_name, all_node_args_have_no_tensors, quantize_node, get_per_tensor_qparams, diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py index 6f49ba442b70..5e3594772c52 100644 --- a/torch/ao/quantization/fx/utils.py +++ b/torch/ao/quantization/fx/utils.py @@ -37,14 +37,6 @@ BIAS_INDEX_DICT = { torch.nn.functional.instance_norm : [4], } -# turn foo.bar -> ['foo', 'bar'] -def _parent_name(target): - r = target.rsplit('.', 1) - if len(r) == 1: - return '', r[0] - else: - return r[0], r[1] - def graph_pretty_str(g, shorten=True) -> str: """Returns a printable representation of the ops in the graph of g. If shorten is True, tries to abbreviate fields. 
diff --git a/torch/ao/quantization/qconfig_dict_utils.py b/torch/ao/quantization/qconfig_dict_utils.py new file mode 100644 index 000000000000..33a1e9a4d624 --- /dev/null +++ b/torch/ao/quantization/qconfig_dict_utils.py @@ -0,0 +1,126 @@ +from collections import OrderedDict +import re +from typing import Any, Dict, Callable, Union + +from .utils import ( + get_combined_dict, + _parent_name, +) +from .quantization_mappings import ( + get_default_qat_module_mappings, +) +from torch.ao.quantization.qconfig import QConfigAny + + +def get_object_type_qconfig( + qconfig_dict: Any, + object_type: Union[Callable, str], + fallback_qconfig: QConfigAny) -> QConfigAny: + # object_type can be + # 1. module type (call_module) + # 2. function (call_function) + # 3. string (call_method) + return qconfig_dict['object_type'].get( + object_type, fallback_qconfig) + + +def get_module_name_regex_qconfig(qconfig_dict, module_name, fallback_qconfig): + for regex_pattern, qconfig in \ + qconfig_dict['module_name_regex'].items(): + if re.match(regex_pattern, module_name): + # first match wins + return qconfig + return fallback_qconfig + + +def get_module_name_qconfig(qconfig_dict, module_name, fallback_qconfig): + if module_name == '': + # module name qconfig not found + return fallback_qconfig + if module_name in qconfig_dict['module_name']: + return qconfig_dict['module_name'][module_name] + else: + parent, _ = _parent_name(module_name) + return get_module_name_qconfig(qconfig_dict, parent, fallback_qconfig) + + +def maybe_adjust_qconfig_for_module_type_or_name(qconfig_dict, module_type, module_name, global_qconfig): + # get qconfig for module_name, + # fallback to module_name_regex_qconfig, module_type_qconfig, + # global_qconfig if necessary + module_type_qconfig = get_object_type_qconfig( + qconfig_dict, module_type, global_qconfig) + module_name_regex_qconfig = get_module_name_regex_qconfig( + qconfig_dict, module_name, module_type_qconfig) + module_name_qconfig = 
get_module_name_qconfig( + qconfig_dict, module_name, module_name_regex_qconfig) + return module_name_qconfig + + +def get_flattened_qconfig_dict(qconfig_dict): + """ flatten the global, object_type and module_name qconfig + to the same qconfig_dict so that it can be used by + propagate_qconfig_ function. + "module_name_regex" is ignored for now since it's not supported + in propagate_qconfig_, but it can be fixed later. + + For example: + Input: { + "": qconfig, + "object_type": [ + (torch.add, qconfig) + ], + "module_name": [ + ("conv", qconfig) + ] + } + + Output: { + "": qconfig, + torch.add: qconfig, + "conv": qconfig + } + """ + flattened = dict() + if '' in qconfig_dict: + flattened[''] = qconfig_dict[''] + + def flatten_key(key): + if key in qconfig_dict: + for (obj, qconfig) in qconfig_dict[key].items(): + flattened[obj] = qconfig + + flatten_key('object_type') + flatten_key('module_name') + return flattened + + +def convert_dict_to_ordered_dict(qconfig_dict: Any) -> Dict[str, Dict[Any, Any]]: + """ Convert dict in qconfig_dict to ordered dict + """ + # convert a qconfig list for a type to OrderedDict + def _convert_to_ordered_dict(key, qconfig_dict): + qconfig_dict[key] = OrderedDict(qconfig_dict.get(key, [])) + + _convert_to_ordered_dict('object_type', qconfig_dict) + _convert_to_ordered_dict('module_name_regex', qconfig_dict) + _convert_to_ordered_dict('module_name', qconfig_dict) + return qconfig_dict + + +def update_qconfig_for_qat( + qconfig_dict: Any, + additional_qat_module_mapping: Dict[Callable, Callable] +) -> Any: + """ + Update the qconfig_dict to account for module swaps during QAT. + During QAT we perform a module swap on the nn.Module types to the corresponding nn.qat.modules types. 
+ """ + all_qat_mappings = get_combined_dict( + get_default_qat_module_mappings(), additional_qat_module_mapping) + object_type_dict = qconfig_dict.get("object_type", None) + new_object_type_dict = object_type_dict.copy() + for k, v in new_object_type_dict.items(): + if k in all_qat_mappings: + object_type_dict[all_qat_mappings[k]] = v + return qconfig_dict diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py index 522df17e87bc..62a4df162b48 100644 --- a/torch/ao/quantization/utils.py +++ b/torch/ao/quantization/utils.py @@ -220,3 +220,14 @@ def calculate_qmin_qmax(quant_min: int, quant_max: int, has_customized_qrange: b else: quant_min, quant_max = 0, 15 return quant_min, quant_max + + +def _parent_name(target): + """ + Turn 'foo.bar' into ['foo', 'bar'] + """ + r = target.rsplit('.', 1) + if len(r) == 1: + return '', r[0] + else: + return r[0], r[1] diff --git a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp index 7876a1e9491d..0cd301e4348f 100644 --- a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp +++ b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp @@ -330,8 +330,8 @@ struct CudaGraphFuser { } if ((consumer->inputs().size() + consumer->outputs().size() + - producer->inputs().size() + - producer->outputs().size()) > subgraph_arg_limit_) { + producer->inputs().size() + producer->outputs().size()) > + subgraph_arg_limit_) { return at::nullopt; } @@ -762,12 +762,14 @@ struct CudaGraphFuser { // fusing nodes sharing inputs, this could save memory bandwidth by // reducing number of tensor read. for (const auto& u : producer->uses()) { - // only merge nodes before consumer, since any sibling after consumer - // has already considered merging this consumer to them already. + // only merge nodes before consumer, since any sibling after + // consumer has already considered merging this consumer to them + // already. 
if (u.user->isBefore(consumer)) { auto fusion_group = tryFuse(consumer, u.user); if (fusion_group) { - return std::make_pair(fusion_group.value()->reverseIterator(), true); + return std::make_pair( + fusion_group.value()->reverseIterator(), true); } } } diff --git a/torch/csrc/jit/codegen/cuda/parser.cpp b/torch/csrc/jit/codegen/cuda/parser.cpp index 5ca70fa64efe..a33b33895c5b 100644 --- a/torch/csrc/jit/codegen/cuda/parser.cpp +++ b/torch/csrc/jit/codegen/cuda/parser.cpp @@ -1077,7 +1077,8 @@ class IrParser { auto mask = castOp(input->getDataType().value(), comparison); auto out = mul(grad_output, mask); - value_map.emplace(node->output()->unique(), ValueHolder(out, format)); + value_map.emplace( + node->output()->unique(), ValueHolder(out, format)); }, nullptr, nullptr); @@ -1232,7 +1233,6 @@ class IrParser { node->output(1)->unique(), ValueHolder(TensorViewBuilder().build(), format)); } - }, nullptr, nullptr); @@ -1895,8 +1895,9 @@ class IrParser { } else { const auto half_to_float = constant_as(node->input(2)); TORCH_INTERNAL_ASSERT( - half_to_float.has_value(), "Bool half_to_float is not valid"); - auto input_tensor_type = node->input(0)->type()->cast(); + half_to_float.has_value(), "Bool half_to_float is not valid"); + auto input_tensor_type = + node->input(0)->type()->cast(); if (half_to_float.value() && input_tensor_type->scalarType() != at::ScalarType::Half) { return false; @@ -2272,8 +2273,7 @@ class IrParser { } { - auto ptr_op = getOperatorForLiteral( - "aten::gelu(Tensor self) -> Tensor"); + auto ptr_op = getOperatorForLiteral("aten::gelu(Tensor self) -> Tensor"); REGISTER_PARSE_RULE( ptr_op, { diff --git a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp index 929b83b801c8..ab54b7776fa0 100644 --- a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp +++ b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp @@ -34,15 +34,14 @@ static const std::string getTempPath() { static const std::string temp_dir = 
getTempPath(); static const std::string so_template = temp_dir + "pytorch_fuserXXXXXX.dll"; static const std::string cpp_template = temp_dir + "pytorch_fuserXXXXXX.cpp"; -static const std::string check_exists_string = - "where \"${program}\" > nul 2> nul"; +static const std::string check_exists_string = "where ${program} > nul 2> nul"; static std::vector env_list; constexpr int so_suffix_len = 4; constexpr int cpp_suffix_len = 4; #else static const std::string so_template = "/tmp/pytorch_fuserXXXXXX.so"; static const std::string cpp_template = "/tmp/pytorch_fuserXXXXXX.cpp"; -static const std::string check_exists_string = "which '${program}' > /dev/null"; +static const std::string check_exists_string = "which ${program} > /dev/null"; constexpr int so_suffix_len = 3; constexpr int cpp_suffix_len = 4; #endif @@ -50,8 +49,10 @@ constexpr int cpp_suffix_len = 4; intptr_t run(const std::string& cmd); static bool programExists(const std::string& program) { + std::stringstream ss; + c10::printQuotedString(ss, program); TemplateEnv env; - env.s("program", program); + env.s("program", ss.str()); std::string cmd = format(check_exists_string, env); #ifdef _MSC_VER return (run(cmd.c_str()) == 0); @@ -188,6 +189,7 @@ struct CompilerConfig { #endif if (!programExists(cxx)) { + TORCH_WARN("Compiler passed via CXX envvar does not exist!"); cxx = ""; } } @@ -205,7 +207,7 @@ struct CompilerConfig { const std::string openmp_flags = "-fopenmp"; #endif // Set openmp to true only if PyTorch is compiled with OpenMP support -// OpenMP is typically not availabel on MacOS platform +// OpenMP is typically not available on MacOS platform #if defined(_OPENMP) bool openmp = true; #else @@ -267,6 +269,7 @@ static void runCompiler( const std::string& cpp_file, const std::string& so_file) { auto& config = getConfig(); + TORCH_CHECK(!config.cxx.empty(), "Failed to compile a fused CPU kernel: Compiler not found"); TemplateEnv env; env.s("cxx", config.cxx); env.s("fopenmp", config.openmp ? 
config.openmp_flags : ""); diff --git a/torch/csrc/jit/tensorexpr/IRSpecification.md b/torch/csrc/jit/tensorexpr/IRSpecification.md index 4fa08c8f47a7..d9c37a4dd15e 100644 --- a/torch/csrc/jit/tensorexpr/IRSpecification.md +++ b/torch/csrc/jit/tensorexpr/IRSpecification.md @@ -4,6 +4,7 @@ Stmt | Store(buf_ = Buf, indices = [Expr], value_ = Expr, mask_ = Expr) | Allocate(buf_ = Buf) | Free(buf_ = Buf) +| PlacementAllocate(buf_ = Buf, buf_to_reuse_ = Buf) | Let(var_ = Var, val_ = Expr) | Cond(condition_ = Expr, true_stmt_ = Block, false_stmt_ = Block) | For(var_ = Var, start_ = Expr, stop_ = Expr, body_ = Block, loopOptions = LoopOptions) diff --git a/torch/csrc/jit/tensorexpr/analysis.h b/torch/csrc/jit/tensorexpr/analysis.h index 6f021448c25f..82e7b7f62afd 100644 --- a/torch/csrc/jit/tensorexpr/analysis.h +++ b/torch/csrc/jit/tensorexpr/analysis.h @@ -248,6 +248,87 @@ class ModifiesVarChecker : public IRVisitor { bool found_{false}; }; +// Traverse the Block stmt to identify the live range of the specified buf. The +// live range, indicated by a pair of integers, specifies the first and last +// stmt in block stmts that access to the buf. +class BufLiveRange : public IRVisitor { + public: + BufLiveRange(BufPtr b) : buf_(b) {} + + static std::tuple liveRange(StmtPtr s, BufPtr b) { + BlockPtr block = to(s); + // We Only analze buffer live ranges for block stmts. 
+ if (!block) { + return std::make_tuple(0, 0); + } + + BufLiveRange analyzer(b); + block->accept(&analyzer); + return analyzer.getLiveRange(); + } + + private: + std::tuple getLiveRange() { + return std::make_tuple(begin_, end_); + } + + bool hasBufReads(StmtPtr s) { + auto loads1 = NodeFinder::find(s); + for (auto l : loads1) { + if (l->buf() == buf_) { + return true; + } + } + auto loads2 = NodeFinder::find(s); + for (auto l : loads2) { + for (auto lb : l->buf_args()) { + if (lb == buf_) { + return true; + } + } + } + return false; + } + + bool hasBufWrites(StmtPtr s) { + auto writes1 = NodeFinder::find(s); + for (auto w : writes1) { + if (w->buf() == buf_) { + return true; + } + } + auto writes2 = NodeFinder::find(s); + for (auto w : writes2) { + if (w->buf() == buf_) { + return true; + } + } + return false; + } + + void findAccAndUpdateLiveRange(StmtPtr s) { + bool has_reads = hasBufReads(s), has_writes = hasBufWrites(s); + if (has_reads || has_writes) { + if (begin_ == -1) { + begin_ = curr_index_; + }; + end_ = curr_index_; + } + } + + void visit(BlockPtr v) { + for (StmtPtr s : *v) { + curr_index_ += 1; + findAccAndUpdateLiveRange(s); + } + } + + BufPtr buf_; + int32_t begin_ = -1; + int32_t end_ = -1; + int32_t curr_index_ = -1; +}; + // A class that analyzes the given program relevant for Block backend // It creates a map of multi dim buffers and their flat verions class CreateBufferMap : public IRVisitor { diff --git a/torch/csrc/jit/tensorexpr/codegen.cpp b/torch/csrc/jit/tensorexpr/codegen.cpp index 0a1051f9dd5f..59786cb980c6 100644 --- a/torch/csrc/jit/tensorexpr/codegen.cpp +++ b/torch/csrc/jit/tensorexpr/codegen.cpp @@ -1,3 +1,5 @@ +#include +#include #include #include @@ -80,6 +82,174 @@ void CodeGen::call_with_numel(void** args, int64_t numel) { TORCH_INTERNAL_ASSERT( false, "This codegen backend does not implement call_with_numel"); } + +c10::optional bufSize(BufPtr buf) { + size_t size = elementSize(buf->dtype().scalar_type()) * 
buf->dtype().lanes(); + for (auto& d : buf->dims()) { + if (!d->isConstant()) { + return c10::nullopt; + } + size = size * (*intValue(d)); + } + return size; +} + +// This algorithm takes the list of intermediate buffers and their liveness +// ranges, and returns the allocations of these buffers. A buffer 'A' can be +// allocated in the memory (appears as a pair of 'A's in the allocation results) +// or reuse another buffer such as 'B' (appears as ('A', 'B')). Specifically, we +// linearly scan the intermediate buffers by the time they appear, and try to +// assign it an existing non-occupied memory allocation. If there are no such +// allocations available, we'll create memory for it. Once we are beyond the +// liveness range of this buffer, we'll mark its corresponding memory allocation +// as "up for grabs" for future reuse. +std::vector> AllocBufsWithMemReuse( + const std::unordered_set& bufs, + const std::unordered_map>& + buf_ranges) { + // Sort buffers by the time they appear. + std::vector bufs_sorted(bufs.begin(), bufs.end()); + auto sorting_function_by_start_time = [&buf_ranges]( + BufPtr b1, BufPtr b2) -> bool { + return std::get<0>(buf_ranges.at(b1)) < std::get<0>(buf_ranges.at(b2)); + }; + std::sort( + bufs_sorted.begin(), bufs_sorted.end(), sorting_function_by_start_time); + + // Map intermediate buffers to the most recently used memory if any. + std::list mem_up_for_grabs; + std::unordered_map buf_mem_map; + std::vector> buf_allocs; + + auto sorting_function_by_end_time = [&buf_ranges]( + BufPtr b1, BufPtr b2) -> bool { + return std::get<1>(buf_ranges.at(b1)) < std::get<1>(buf_ranges.at(b2)); + }; + for (auto buf : bufs_sorted) { + // If the buf has dynamic shapes, we'll skip it (i.e., allocate memory for + // it, and there are no future reuses on its memory). 
+ // TODO: reuse memory for bufs with dynamic shapes + if (!bufSize(buf)) { + buf_allocs.emplace_back(std::make_pair(buf, buf)); + continue; + } + + auto start = std::get<0>(buf_ranges.at(buf)); + auto end = std::get<1>(buf_ranges.at(buf)); + + // Release memory for buffers whose liveness range ends before the creation + // time of this buf. + // TODO: optimize in-place opererations and copy operations + std::vector buf_to_release; + for (auto& mapped : buf_mem_map) { + auto buf_mapped = mapped.first; + auto end_buf_mapped = std::get<1>(buf_ranges.at(buf_mapped)); + if (end_buf_mapped < start) { + buf_to_release.push_back(buf_mapped); + } + } + + // Sort the buffers in the order of used time so the head of the release + // list contains the most recently used buf. + std::sort( + buf_to_release.begin(), + buf_to_release.end(), + sorting_function_by_end_time); + for (auto& buf_rl : buf_to_release) { + mem_up_for_grabs.push_front(buf_mem_map.at(buf_rl)); + buf_mem_map.erase(buf_rl); + } + + bool allocated = false; + // Check whether there are free memories that this buf can reuse. + for (auto it = mem_up_for_grabs.begin(); it != mem_up_for_grabs.end(); + it++) { + auto m = *it; + if (bufSize(m) >= bufSize(buf)) { + buf_mem_map[buf] = m; + buf_allocs.emplace_back(std::make_pair(buf, m)); + allocated = true; + mem_up_for_grabs.erase(it); + break; + } + } + + // If there are no memories to reuse, we'll have to allocate new memory for + // it. + if (!allocated) { + buf_mem_map[buf] = buf; + buf_allocs.emplace_back(std::make_pair(buf, buf)); + } + } + + return buf_allocs; +} + +StmtPtr insertAllocFree( + std::vector>& buf_allocs, + StmtPtr stmt) { + BlockPtr b = to(stmt); + if (!b) { + b = alloc(std::vector({stmt})); + } + + // Insert allocations and frees for temporary buffers at global scope. 
+ for (auto rit = buf_allocs.rbegin(); rit != buf_allocs.rend(); ++rit) { + if (rit->first == rit->second) { + BufPtr buf = rit->first; + b->prepend_stmt(alloc(buf)); + b->append_stmt(alloc(buf)); + } else { + b->prepend_stmt(alloc(rit->first, rit->second)); + } + } + + return b; +} + +// We allocate intermediate buffers by inserting Allocate/Free or +// PlacementAllocate stmts. Allocate/Free stmts will allocate memory at runtime, +// and PlacementAllocate stmt reuses the memory of one buffer for another +// buffer. In current implementation, we use linear scan for memory reuses. +// TODO: try more memory reuse algorithms and compare their memory efficiency. +void CodeGen::allocIntermediateBufs() { + // Identify intermediate buffers that are not allocated yet. + auto bufs = NodeFinder::find(stmt_); + std::unordered_set bufs_allocated; + for (auto b : buffer_args_) { + bufs_allocated.insert(b.buf()); + } + auto allocs = NodeFinder::find(stmt_); + for (auto a : allocs) { + bufs_allocated.insert(a->buf()); + } + + std::unordered_set interm_bufs; + std::unordered_map> interm_buf_ranges; + for (auto buf : bufs) { + if (!bufs_allocated.count(buf) && !interm_bufs.count(buf)) { + interm_bufs.insert(buf); + + // Identify the access stmts to each unallocated intermeiate buffer. + auto range = BufLiveRange::liveRange(stmt_, buf); + interm_buf_ranges.emplace(buf, range); + } + } + + // For each intermediate buffer, we reuse the memory of an old buffer whose + // liveness range does not overlap with the current buffer, or allocate memory + // if reusing buffer is impossible. + auto buf_allocs = AllocBufsWithMemReuse(interm_bufs, interm_buf_ranges); + + // Insert memory allocation/mapping nodes. 
+ if (buf_allocs.size() > 0) { + auto stmt_new = insertAllocFree(buf_allocs, stmt_); + set_stmt(stmt_new); + } + + GRAPH_DEBUG("\nMemory Allocation:\n\n", *stmt(), "\n"); +} + } // namespace tensorexpr } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/tensorexpr/codegen.h b/torch/csrc/jit/tensorexpr/codegen.h index 5665f658bf7e..8b2041215fb8 100644 --- a/torch/csrc/jit/tensorexpr/codegen.h +++ b/torch/csrc/jit/tensorexpr/codegen.h @@ -30,7 +30,9 @@ class TORCH_API CodeGen { : stmt_(stmt), buffer_args_(std::move(buffer_args)), device_(device), - kernel_func_name_(std::move(kernel_func_name)) {} + kernel_func_name_(std::move(kernel_func_name)) { + allocIntermediateBufs(); + } virtual ~CodeGen() = default; @@ -99,6 +101,8 @@ class TORCH_API CodeGen { return kernel_func_name_; } + void allocIntermediateBufs(); + protected: static void* argToPtr(const BufferArg& bufferArg, const CallArg& callArg); diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp index c8793fd06892..c867e16a34b7 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp @@ -105,6 +105,10 @@ void CudaAnalysis::visit(AllocatePtr v) { throw std::runtime_error("Global alloc not supported yet"); } +void CudaAnalysis::visit(PlacementAllocatePtr v) { + throw std::runtime_error("Memory reuse not supported yet"); +} + void CudaAnalysis::visit(ForPtr v) { // Recurse first. 
v->body()->accept(this); diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.h b/torch/csrc/jit/tensorexpr/cuda_codegen.h index 912f4e9a9949..30af7a42929b 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.h +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.h @@ -54,6 +54,7 @@ class CudaAnalysis : public IRVisitor { void visit(AllocatePtr v) override; void visit(FreePtr v) override; + void visit(PlacementAllocatePtr v) override; void visit(ForPtr v) override; std::unordered_set store_targets_; diff --git a/torch/csrc/jit/tensorexpr/eval.cpp b/torch/csrc/jit/tensorexpr/eval.cpp index 73178384571b..51c21f85401a 100644 --- a/torch/csrc/jit/tensorexpr/eval.cpp +++ b/torch/csrc/jit/tensorexpr/eval.cpp @@ -981,6 +981,10 @@ class SimpleIREvaluatorImpl : public IRVisitor { internal_buffers_.insert(std::make_pair(b, std::move(buffer))); } + void visit(PlacementAllocatePtr v) override { + buffer_mapping_[v->buf()] = buffer_mapping_[v->buf_to_reuse()]; + } + void visit(FreePtr v) override { BufPtr b = v->buf(); GRAPH_DEBUG("FREE: buf=", v->buf()->name_hint()); diff --git a/torch/csrc/jit/tensorexpr/fwd_decls.h b/torch/csrc/jit/tensorexpr/fwd_decls.h index 2054a56ff41c..f8efd30b510d 100644 --- a/torch/csrc/jit/tensorexpr/fwd_decls.h +++ b/torch/csrc/jit/tensorexpr/fwd_decls.h @@ -102,6 +102,7 @@ class Cond; class ExternalCall; class For; class Free; +class PlacementAllocate; class SyncThreads; using AllocatePtr = NodePtr; using AtomicAddPtr = NodePtr; @@ -110,6 +111,7 @@ using CondPtr = NodePtr; using ExternalCallPtr = NodePtr; using ForPtr = NodePtr; using FreePtr = NodePtr; +using PlacementAllocatePtr = NodePtr; using SyncThreadsPtr = NodePtr; #define IMM_DECLARE(Type, Name) \ diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.cpp b/torch/csrc/jit/tensorexpr/ir_mutator.cpp index 44ef4fbccb9f..4fb8cd451d63 100644 --- a/torch/csrc/jit/tensorexpr/ir_mutator.cpp +++ b/torch/csrc/jit/tensorexpr/ir_mutator.cpp @@ -497,6 +497,22 @@ StmtPtr IRMutator::mutate(FreePtr v) { return v; } 
+StmtPtr IRMutator::mutate(PlacementAllocatePtr v) { + BufPtr buf = v->buf(); + BufPtr buf_new = to(buf->accept_mutator(this)); + TORCH_INTERNAL_ASSERT( + buf_new, buildErrorMessage("IRMutator produced null for Buf.")); + v->set_buf(buf_new); + + BufPtr buf_to_reuse = v->buf_to_reuse(); + BufPtr buf_to_reuse_new = to(buf_to_reuse->accept_mutator(this)); + TORCH_INTERNAL_ASSERT( + buf_to_reuse_new, buildErrorMessage("IRMutator produced null for Buf.")); + v->set_buf_to_reuse(buf_to_reuse_new); + + return v; +} + StmtPtr IRMutator::mutate(LetPtr v) { VarPtr var_old = v->var(); VarPtr var_new = to(var_old->accept_mutator(this)); diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.h b/torch/csrc/jit/tensorexpr/ir_mutator.h index 27f41185e75f..2d37d49ba60c 100644 --- a/torch/csrc/jit/tensorexpr/ir_mutator.h +++ b/torch/csrc/jit/tensorexpr/ir_mutator.h @@ -54,6 +54,7 @@ class TORCH_API IRMutator { virtual StmtPtr mutate(AllocatePtr v); virtual StmtPtr mutate(FreePtr v); + virtual StmtPtr mutate(PlacementAllocatePtr v); virtual StmtPtr mutate(LetPtr v); virtual StmtPtr mutate(CondPtr v); }; diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp index 53efe5b1ed2c..35d481cdd8d2 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.cpp +++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp @@ -483,6 +483,11 @@ void IRPrinter::visit(FreePtr v) { os() << "Free(" << *v->buffer_var() << ");"; } +void IRPrinter::visit(PlacementAllocatePtr v) { + os() << "Alias(" << *v->buf()->base_handle() << "," + << *v->buf_to_reuse()->base_handle() << ");"; +} + void IRPrinter::visit(LetPtr v) { os() << dtypeToCppString(v->var()->dtype()) << " " << *v->var(); os() << " = " << *v->value(); diff --git a/torch/csrc/jit/tensorexpr/ir_printer.h b/torch/csrc/jit/tensorexpr/ir_printer.h index db2ecc062ae7..c58012e8a1b8 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.h +++ b/torch/csrc/jit/tensorexpr/ir_printer.h @@ -61,6 +61,7 @@ class TORCH_API IRPrinter : public 
IRVisitor { void visit(BlockPtr v) override; void visit(AllocatePtr v) override; void visit(FreePtr v) override; + void visit(PlacementAllocatePtr v) override; void visit(LetPtr v) override; // A child class may have a difference rule for generating dtype diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.cpp b/torch/csrc/jit/tensorexpr/ir_visitor.cpp index 44db863b8205..649a51ee4577 100644 --- a/torch/csrc/jit/tensorexpr/ir_visitor.cpp +++ b/torch/csrc/jit/tensorexpr/ir_visitor.cpp @@ -183,6 +183,11 @@ void IRVisitor::visit(FreePtr v) { v->buffer_var()->accept(this); } +void IRVisitor::visit(PlacementAllocatePtr v) { + v->buf()->accept(this); + v->buf_to_reuse()->accept(this); +} + void IRVisitor::visit(LetPtr v) { v->var()->accept(this); v->value()->accept(this); diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.h b/torch/csrc/jit/tensorexpr/ir_visitor.h index 26101a2c8fde..2bb48088d89f 100644 --- a/torch/csrc/jit/tensorexpr/ir_visitor.h +++ b/torch/csrc/jit/tensorexpr/ir_visitor.h @@ -43,6 +43,7 @@ class TORCH_API IRVisitor { virtual void visit(IntrinsicsPtr v); virtual void visit(AllocatePtr v); virtual void visit(FreePtr v); + virtual void visit(PlacementAllocatePtr v); virtual void visit(LetPtr v); virtual void visit(CondPtr v); virtual void visit(TermPtr v); diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index ae049c39a0ac..0445636c5c4f 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -781,12 +781,11 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) { if (pre_alloc_) { auto interm_bufs = l.getIntermediateBufs(); - interm_bufs = preAllocIntermediateBufs(interm_bufs); - l.prepareForCodegen(interm_bufs); - } else { - l.prepareForCodegen(); + preAllocIntermediateBufs(interm_bufs); } + l.prepareForCodegen(); + GRAPH_DEBUG("after prepareForCodegen", *l.root_stmt()); l.simplify(); GRAPH_DEBUG("after simplification", *l.root_stmt()); diff --git 
a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index 650d4c45c8f2..109cf80a55b1 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -290,6 +290,7 @@ class LLVMCodeGenImpl : public IRVisitor { void visit(IntrinsicsPtr v) override; void visit(AllocatePtr v) override; void visit(FreePtr v) override; + void visit(PlacementAllocatePtr v) override; void visit(LetPtr v) override; void visit(CondPtr v) override; void visit(ExternalCallPtr v) override; @@ -355,9 +356,9 @@ LLVMCodeGen::LLVMCodeGen( c10::optional triple, c10::optional cpu, c10::optional attrs) - : CodeGen(stmt, args, device, kernel_func_name), - impl_(std::make_unique< - LLVMCodeGenImpl>(stmt, args, device, dtype, triple, cpu, attrs)) { + : CodeGen(stmt, args, device, kernel_func_name) { + impl_ = std::make_unique( + this->stmt(), args, device, dtype, triple, cpu, attrs); callee_ = std::make_unique( impl_->releaseJIT(), (void*)impl_->getKernelAddress()); } @@ -2049,6 +2050,11 @@ void LLVMCodeGenImpl::visit(AllocatePtr v) { varToVal_[v->buffer_var()] = malloc; } +void LLVMCodeGenImpl::visit(PlacementAllocatePtr v) { + llvm::Value* ptr = varToVal_.at(v->buf_to_reuse()->base_handle()); + varToVal_[v->buf()->base_handle()] = ptr; +} + void LLVMCodeGenImpl::visit(FreePtr v) { value_ = llvm::ConstantInt::get(IntTy_, 0); llvm::Value* ptr = varToVal_.at(v->buffer_var()); diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 428c145388a8..b6a333bf1e58 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -1133,36 +1133,6 @@ BlockPtr findLowestContainingBlock(const std::vector& uses) { return b; } -StmtPtr LoopNest::insertAllocFree( - StmtPtr stmt, - const c10::optional>& interm_bufs /* = c10::nullopt*/) { - std::vector intermediate_bufs; - if (interm_bufs) { - intermediate_bufs = *interm_bufs; - } else { - intermediate_bufs = 
getIntermediateBufs(); - } - - if (intermediate_bufs.size() == 0ULL) { - return stmt; - } - - BlockPtr b = to(stmt); - if (!b) { - b = alloc(std::vector({stmt})); - } - - std::unordered_map> uses = - findLoadOrStoreUses(stmt); - // Insert allocations and frees for temporary buffers at global scope. - for (BufPtr buf : intermediate_bufs) { - b->prepend_stmt(alloc(buf)); - b->append_stmt(alloc(buf)); - } - - return b; -} - class StmtDeleter : public IRMutator { public: StmtDeleter(const std::unordered_set& targets) : targets_(targets) {} @@ -1219,16 +1189,12 @@ void LoopNest::eliminateDeadStores() { root_stmt_ = root_stmt_->accept_mutator(&deleter); } -void LoopNest::prepareForCodegen( - const c10::optional>& interm_bufs /*= c10::nullopt*/) { +void LoopNest::prepareForCodegen() { // Expand reduction ops. ReductionExpander reduceExpander; root_stmt_ = reduceExpander.expand(root_stmt_); root_stmt_ = FlattenIndexes(root_stmt_); - - // Add allocs and frees for intermediate buffers at the global level. - root_stmt_ = insertAllocFree(root_stmt_, interm_bufs); } namespace { diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h index e7e29e699b18..3bf23abc1f84 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.h +++ b/torch/csrc/jit/tensorexpr/loopnest.h @@ -550,16 +550,7 @@ class TORCH_API LoopNest { void eliminateDeadStores(); - // Make the stmt ready for codegen. The optional argument 'interm_bufs' allows - // users to specify intermediate buffers that need runtime allocation. In - // default, we will insert 'Alloc/Free' stmts to allocate all intermediate - // buffers at runtime but users may have pre-allocated some of them at compile - // time, and in that case the user can specify what buffers to insert - // 'Alloc/Free' stmts for using 'interm_bufs'. - // TODO: refactor function 'prepareForCodegen' to remove argument - // 'interm_bufs'. 
- void prepareForCodegen( - const c10::optional>& interm_bufs = c10::nullopt); + void prepareForCodegen(); const std::unordered_set getInputBufs() const; const std::unordered_set getOutputBufs() const { @@ -571,9 +562,6 @@ class TORCH_API LoopNest { void initialize( const std::vector& output_tensors, const std::vector& tensors_to_compute); - StmtPtr insertAllocFree( - StmtPtr stmt, - const c10::optional>& interm_bufs = c10::nullopt); StmtPtr root_stmt_; diff --git a/torch/csrc/jit/tensorexpr/stmt.h b/torch/csrc/jit/tensorexpr/stmt.h index 22efa47fd1b0..b39976443e48 100644 --- a/torch/csrc/jit/tensorexpr/stmt.h +++ b/torch/csrc/jit/tensorexpr/stmt.h @@ -388,6 +388,42 @@ class TORCH_API Allocate : public StmtNode { // TODO: add memory types. }; +// PlacementAllocate is a variation of the Allocate operator in NNC IR. It does +// not allocate memory but reuse the memory of another buffer for the given +// buffer. +class TORCH_API PlacementAllocate : public StmtNode { + public: + static PlacementAllocatePtr make( + const BufHandle& buf_handle, + const BufHandle& buf_handle_to_reuse) { + return alloc( + buf_handle.node(), buf_handle_to_reuse.node()); + } + + BufPtr buf() const { + return buf_; + } + + BufPtr buf_to_reuse() const { + return buf_to_reuse_; + } + + void set_buf(BufPtr buf) { + buf_ = buf; + } + + void set_buf_to_reuse(BufPtr buf) { + buf_to_reuse_ = buf; + } + + explicit PlacementAllocate(BufPtr buf, BufPtr buf_to_reuse) + : buf_(buf), buf_to_reuse_(buf_to_reuse) {} + + private: + BufPtr buf_; + BufPtr buf_to_reuse_; +}; + // Free the specific buffer. It is an error. 
class TORCH_API Free : public StmtNode { public: diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py index e6c25bff72bd..09e26d0b7cce 100644 --- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py +++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py @@ -842,34 +842,19 @@ def acc_ops_sum( args: Tuple[Argument, ...], kwargs: Dict[str, Argument], name: str, -) -> Union[TRTTensor, Sequence[TRTTensor]]: - input_val = kwargs["input"] - if not isinstance(input_val, TRTTensor): - raise RuntimeError( - f"sum received input {input_val} that is not part " - "of the TensorRT region!" - ) +) -> TRTTensor: + return add_reduce_layer(network, target, args, kwargs, trt.ReduceOperation.SUM, name) - # If dim is specified, then we are computing reduced sum over certain dimensions. - # Otherwise, we are dong summation over all elements, which is only supported in - # explicit batch dimension. - if "dim" not in kwargs: - assert ( - not network.has_implicit_batch_dimension - ), "Do not support sum all the elements for implicit batch." 
- dim = range(0, len(input_val.shape)) - else: - dim = kwargs["dim"] # type: ignore[assignment] - keepdim = False if "keepdim" not in kwargs else kwargs["keepdim"] - layer = network.add_reduce( - input_val, - trt.ReduceOperation.SUM, - get_axes_for_reduce_op(dim, network.has_implicit_batch_dimension), - keepdim, - ) - set_layer_name(layer, target, name) - return layer.get_output(0) +@tensorrt_converter(acc_ops.mean) +def acc_ops_mean( + network: TRTNetwork, + target: Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + name: str, +) -> TRTTensor: + return add_reduce_layer(network, target, args, kwargs, trt.ReduceOperation.AVG, name) def add_acc_ops_full_reduce(network, target, args, kwargs, name, reduce_op): diff --git a/torch/fx/experimental/fx2trt/converters/converter_utils.py b/torch/fx/experimental/fx2trt/converters/converter_utils.py index 25f71575523e..3d87a57fca96 100644 --- a/torch/fx/experimental/fx2trt/converters/converter_utils.py +++ b/torch/fx/experimental/fx2trt/converters/converter_utils.py @@ -1,9 +1,9 @@ -from typing import Any, Tuple, Sequence, Union, List, Optional +from typing import Any, Tuple, Sequence, Union, List, Optional, Dict import numpy as np import tensorrt as trt import torch -from torch.fx.node import Target +from torch.fx.node import Target, Argument from torch.fx.experimental.fx2trt.types import * # noqa: F403 from torch.fx.experimental.fx2trt.utils import torch_dtype_from_trt @@ -450,6 +450,58 @@ def add_activation_layer( return layer.get_output(0) +def add_reduce_layer( + network: TRTNetwork, + target: Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + operation_type: trt.ActivationType, + name: str, +) -> TRTTensor: + """ + Add a TensorRT Reduce layer to `network`. + + Args: + network (TRTNetwork): TensorRT network object. + target (Target): Target of fx node. + args (Tuple[Argument, ...]): Args of the fx node. + kwargs (Dict[str, Argument]): Kwargs of the fx node. 
+ operation_type (trt.ElementWiseOperation): Type of the TensorRT activation + operation. + name (str): The name we want to assign to the created TensorRT layer. + + Returns: + The output of TensorRT Reduce layer. + """ + input_val = kwargs["input"] + if not isinstance(input_val, TRTTensor): + raise RuntimeError( + f"{name} received input {input_val} that is not part " + "of the TensorRT region!" + ) + + # If dim is specified, then the op is reducing over certain dimensions. + # Otherwise, it's reducing over all elements, which is only supported in + # explicit batch dimension. + if "dim" not in kwargs: + assert ( + not network.has_implicit_batch_dimension + ), f"We don't support reduce({name}) over all the elements if batch dim is implicit." + dim = range(0, len(input_val.shape)) + else: + dim = kwargs["dim"] # type: ignore[assignment] + + keepdim = False if "keepdim" not in kwargs else kwargs["keepdim"] + layer = network.add_reduce( + input_val, + operation_type, + get_axes_for_reduce_op(dim, network.has_implicit_batch_dimension), + keepdim, + ) + set_layer_name(layer, target, name) + return layer.get_output(0) + + def get_dyn_range(scale, zero_point, dtype): """ Get the dynamic range of a tensor based on its scale, zero_point and dtype. 
diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py index 14662729e89e..416556db046c 100644 --- a/torch/fx/experimental/fx_acc/acc_ops.py +++ b/torch/fx/experimental/fx_acc/acc_ops.py @@ -30,7 +30,8 @@ def linear(*, input, weight, bias): @register_acc_op_properties(AccOpProperty.quantized) @register_acc_op -def quantized_linear(*, input, weight, bias, acc_out_ty): +def quantized_linear(*, input, weight, bias, acc_out_ty=None): + assert acc_out_ty is not None qparams = acc_utils.get_field_from_acc_out_ty(acc_out_ty, "qparams") return nn.quantized.functional.linear( input, @@ -490,7 +491,8 @@ def hardswish_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node: ], ) @register_acc_op -def quantized_add(*, input, other, acc_out_ty): +def quantized_add(*, input, other, acc_out_ty=None): + assert acc_out_ty is not None qparams = acc_utils.get_field_from_acc_out_ty(acc_out_ty, "qparams") return torch.ops.quantized.add( input, @@ -515,7 +517,8 @@ def quantized_add(*, input, other, acc_out_ty): ], ) @register_acc_op -def quantized_mul(*, input, other, acc_out_ty): +def quantized_mul(*, input, other, acc_out_ty=None): + assert acc_out_ty is not None qparams = acc_utils.get_field_from_acc_out_ty(acc_out_ty, "qparams") return torch.ops.quantized.mul( input, @@ -542,7 +545,8 @@ def quantized_mul(*, input, other, acc_out_ty): ], ) @register_acc_op -def quantize_per_tensor(*, input, acc_out_ty): +def quantize_per_tensor(*, input, acc_out_ty=None): + assert acc_out_ty is not None qparams = acc_utils.get_field_from_acc_out_ty(acc_out_ty, "qparams") dtype = acc_utils.get_field_from_acc_out_ty(acc_out_ty, "dtype") return torch.quantize_per_tensor( @@ -568,7 +572,8 @@ def quantize_per_tensor(*, input, acc_out_ty): ], ) @register_acc_op -def quantize_per_channel(*, input, acc_out_ty): +def quantize_per_channel(*, input, acc_out_ty=None): + assert acc_out_ty is not None qparams = acc_utils.get_field_from_acc_out_ty(acc_out_ty, "qparams") 
dtype = acc_utils.get_field_from_acc_out_ty(acc_out_ty, "dtype") return torch.quantize_per_channel( @@ -590,13 +595,15 @@ def dequantize(*, input): @register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary, AccOpProperty.quantized) @register_acc_op -def rescale_quantize_per_tensor(*, input, acc_out_ty): +def rescale_quantize_per_tensor(*, input, acc_out_ty=None): + assert acc_out_ty is not None d = dequantize(input=input) return quantize_per_tensor(input=d, acc_out_ty=acc_out_ty) @register_acc_op_properties(AccOpProperty.unary, AccOpProperty.quantized) @register_acc_op -def rescale_quantize_per_channel(*, input, acc_out_ty): +def rescale_quantize_per_channel(*, input, acc_out_ty=None): + assert acc_out_ty is not None d = dequantize(input=input) return quantize_per_channel(input=d, acc_out_ty=acc_out_ty) @@ -710,7 +717,25 @@ def torch_log1p_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node return log_node +def reduce_op_mapper(node: torch.fx.Node, mod: torch.fx.GraphModule, func) -> torch.fx.Node: + with node.graph.inserting_before(node): + kwargs = dict(node.kwargs) + if "dim" in kwargs and isinstance(kwargs["dim"], int): + kwargs["dim"] = (kwargs["dim"],) + new_node = node.graph.call_function(func, kwargs=kwargs) + new_node.meta = node.meta.copy() + return new_node + + @register_acc_op_properties(AccOpProperty.unary) +@register_acc_op +def sum(*, input, dim=None, keepdim=False, dtype=None): + if dim is not None: + return torch.sum(**locals()) + else: + return input.sum(dtype=dtype) + + @register_custom_acc_mapper_fn( op_and_target=("call_method", "sum"), arg_replacement_tuples=[ @@ -729,23 +754,39 @@ def torch_log1p_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node ("dtype", "dtype", this_arg_is_optional), ], ) -def add_sum_mapper(node: torch.fx.Node, mod: torch.fx.GraphModule) -> torch.fx.Node: - with node.graph.inserting_before(node): - sum_kwargs = dict(node.kwargs) - if "dim" in sum_kwargs and 
isinstance(sum_kwargs["dim"], int): - sum_kwargs["dim"] = (sum_kwargs["dim"],) - sum_node = node.graph.call_function(sum, kwargs=sum_kwargs) - sum_node.meta = node.meta.copy() - return sum_node +def sum_mapper(node: torch.fx.Node, mod: torch.fx.GraphModule) -> torch.fx.Node: + return reduce_op_mapper(node, mod, sum) @register_acc_op_properties(AccOpProperty.unary) @register_acc_op -def sum(*, input, dim=None, keepdim=False, dtype=None): +def mean(*, input, dim=None, keepdim=False, dtype=None): if dim is not None: - return torch.sum(**locals()) + return torch.mean(**locals()) else: - return input.sum(dtype=dtype) + return input.mean(dtype=dtype) + + +@register_custom_acc_mapper_fn( + op_and_target=("call_method", "mean"), + arg_replacement_tuples=[ + ("input", "input"), + ("dim", "dim", this_arg_is_optional), + ("keepdim", "keepdim", this_arg_is_optional), + ("dtype", "dtype", this_arg_is_optional), + ], +) +@register_custom_acc_mapper_fn( + op_and_target=("call_function", torch.mean), + arg_replacement_tuples=[ + ("input", "input"), + ("dim", "dim", this_arg_is_optional), + ("keepdim", "keepdim", this_arg_is_optional), + ("dtype", "dtype", this_arg_is_optional), + ], +) +def mean_mapper(node, mod): + return reduce_op_mapper(node, mod, mean) @register_custom_acc_mapper_fn( @@ -1374,7 +1415,8 @@ def custom_narrow_mapper(node: torch.fx.Node, mod: nn.Module) -> torch.fx.Node: kwargs_to_move_to_acc_out_ty=[("shape", "shape")], ) @register_acc_op -def reshape(*, input, acc_out_ty): +def reshape(*, input, acc_out_ty=None): + assert acc_out_ty is not None return torch.reshape( input, tuple(acc_utils.get_field_from_acc_out_ty(acc_out_ty, "shape")) ) @@ -1415,8 +1457,8 @@ def custom_tensor_reshape_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx. 
@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) @register_acc_op -def to_dtype(input, acc_out_ty): - assert acc_out_ty is not None, "valid acc_out_ty needed" +def to_dtype(input, acc_out_ty=None): + assert acc_out_ty is not None return input.to(dtype=acc_utils.get_field_from_acc_out_ty(acc_out_ty, "dtype")) diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py index 5312de637742..87b4519638fb 100644 --- a/torch/nn/modules/container.py +++ b/torch/nn/modules/container.py @@ -603,9 +603,31 @@ class ParameterDict(Module): def __iter__(self) -> Iterator[str]: return iter(self._parameters.keys()) + def __reversed__(self) -> Iterator[str]: + return reversed(list(self._parameters.keys())) + + def copy(self) -> 'ParameterDict': + """Returns a copy of this :class:`~torch.nn.ParameterDict` instance. + """ + return ParameterDict(self._parameters.copy()) + def __contains__(self, key: str) -> bool: return key in self._parameters + def setdefault(self, key: str, default: Optional['Parameter'] = None) -> 'Parameter': + """If key is in the ParameterDict, return its parameter. + If not, insert `key` with a parameter `default` and return `default`. + `default` defaults to `None`. + + Args: + key (string): key to set default for + default (:class:`~torch.nn.Parameter`): the parameter set to the key + """ + if key in self._parameters: + return self._parameters[key] + self[key] = default # type: ignore[assignment] + return self._parameters[key] + def clear(self) -> None: """Remove all items from the ParameterDict. """ @@ -621,6 +643,31 @@ class ParameterDict(Module): del self[key] return v + def popitem(self) -> Tuple[str, 'Parameter']: + """Remove and return the last inserted `(key, parameter)` pair + from the ParameterDict + """ + return self._parameters.popitem() + + def get(self, key: str, default: Optional['Parameter'] = None) -> 'Parameter | None': + r"""Return the parameter associated with key if present. 
+ Otherwise return default if provided, None if not. + + Args: + key (string): key to get from the ParameterDict + default (Parameter, optional): value to return if key not present + """ + return self._parameters.get(key, default) + + def fromkeys(self, keys: Iterable['str'], default: Optional['Parameter'] = None) -> 'ParameterDict': + r"""Return a new ParameterDict with the keys provided + + Args: + keys (iterable, string): keys to make the new ParameterDict from + default (Parameter, optional): value to set for all keys + """ + return ParameterDict(self._parameters.fromkeys(keys, default)) # type: ignore[arg-type] + def keys(self) -> Iterable[str]: r"""Return an iterable of the ParameterDict keys. """ @@ -693,3 +740,17 @@ class ParameterDict(Module): "on each GPU except the original one.") return super(ParameterDict, self)._replicate_for_data_parallel() + + def __or__(self, other: 'ParameterDict') -> 'ParameterDict': + copy = self.copy() + copy.update(other._parameters) + return copy + + def __ror__(self, other: 'ParameterDict') -> 'ParameterDict': + copy = other.copy() + copy.update(self._parameters) + return copy + + def __ior__(self, other : 'ParameterDict') -> 'ParameterDict': + self.update(other._parameters) + return self diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index 4b49b9d48cea..d891c667b70b 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -7,7 +7,6 @@ appropriate files under `torch/ao/quantization/fx/`, while adding an import stat here. 
""" from torch.ao.quantization.fx.utils import ( - _parent_name, graph_pretty_str, get_per_tensor_qparams, quantize_node, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 48d3b030c81e..49c18c620267 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1275,11 +1275,8 @@ def sample_inputs_tensor_split(op_info, device, dtype, requires_grad, **kwargs): (3, -1), ) - def generator(): - for args in args_cases: - yield SampleInput(make_input((S, S, S)), args=args) - - return list(generator()) + for args in args_cases: + yield SampleInput(make_input((S, S, S)), args=args) def sample_inputs_linalg_det(op_info, device, dtype, requires_grad): @@ -1575,15 +1572,12 @@ def sample_inputs_cosine_similarity(op_info, device, dtype, requires_grad, **kwa ((S, S), {}) ) - def generator(): - for input_shape, kwargs in cases: - yield SampleInput(make_arg(input_shape), args=(make_arg(input_shape),), kwargs=kwargs) - # Test for Broadcasting - yield SampleInput(make_arg((1, 2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -1}) - yield SampleInput(make_arg((1, 2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -2}) - yield SampleInput(make_arg((2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -1}) - - return list(generator()) + for input_shape, kwargs in cases: + yield SampleInput(make_arg(input_shape), args=(make_arg(input_shape),), kwargs=kwargs) + # Test for Broadcasting + yield SampleInput(make_arg((1, 2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -1}) + yield SampleInput(make_arg((1, 2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -2}) + yield SampleInput(make_arg((2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -1}) def sample_inputs_batch_norm(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -1601,48 +1595,45 @@ def 
sample_inputs_batch_norm(op_info, device, dtype, requires_grad, **kwargs): ((2, 1), {}), ) - def generator(): - for input_shape, kwargs in cases: - # args: running mean, running var, weight and bias should necessarily be of shape: (channels,) - channels = input_shape[1] if len(input_shape) > 1 else 0 - weight = make_arg(channels) if channels > 0 else None - bias = make_arg(channels) if channels > 0 else None - running_mean = make_arg_without_requires_grad(channels, low=0) - running_var = make_arg_without_requires_grad(channels, low=0) + for input_shape, kwargs in cases: + # args: running mean, running var, weight and bias should necessarily be of shape: (channels,) + channels = input_shape[1] if len(input_shape) > 1 else 0 + weight = make_arg(channels) if channels > 0 else None + bias = make_arg(channels) if channels > 0 else None + running_mean = make_arg_without_requires_grad(channels, low=0) + running_var = make_arg_without_requires_grad(channels, low=0) - yield SampleInput( - make_arg(input_shape), - args=( - running_mean, - running_var, - weight, - bias - ), - kwargs=kwargs - ) + yield SampleInput( + make_arg(input_shape), + args=( + running_mean, + running_var, + weight, + bias + ), + kwargs=kwargs + ) - # Checking for permutations of weights and biases as `None` - weights = [channels, None, None] - biases = [None, channels, None] - is_training = [True, False, False] + # Checking for permutations of weights and biases as `None` + weights = [channels, None, None] + biases = [None, channels, None] + is_training = [True, False, False] - for weight, bias, training in zip(weights, biases, is_training): - yield SampleInput( - make_arg(input_shape), - args=( - running_mean, - running_var, - make_arg(channels), - make_arg(channels) - ), - kwargs={'training': training} - ) + for weight, bias, training in zip(weights, biases, is_training): + yield SampleInput( + make_arg(input_shape), + args=( + running_mean, + running_var, + make_arg(channels), + make_arg(channels) + 
), + kwargs={'training': training} + ) - # Test case for no optional kwargs - # running_mean and running_var are required in evaluation mode (training: False) but not in training mode - yield SampleInput(make_arg((1, 2, 3)), args=(None, None), kwargs={'training': True}) - - return list(generator()) + # Test case for no optional kwargs + # running_mean and running_var are required in evaluation mode (training: False) but not in training mode + yield SampleInput(make_arg((1, 2, 3)), args=(None, None), kwargs={'training': True}) def sample_inputs_nn_activation_relu(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -1654,11 +1645,8 @@ def sample_inputs_nn_activation_relu(op_info, device, dtype, requires_grad, **kw ((S, M, S)) ) - def generator(): - for shape in cases: - yield SampleInput(make_arg(shape)) - - return list(generator()) + for shape in cases: + yield SampleInput(make_arg(shape)) def sample_inputs_nn_functional_prelu(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -1670,17 +1658,14 @@ def sample_inputs_nn_functional_prelu(op_info, device, dtype, requires_grad, **k ((S, M, S)) ) - def generator(): - for shape in cases: - for weight in [-1., 0., 0.8, 1.]: - weight_tensor = torch.tensor(weight, device=device, dtype=dtype, requires_grad=requires_grad) - yield SampleInput(make_arg(shape), kwargs=dict(weight=weight_tensor)) + for shape in cases: + for weight in [-1., 0., 0.8, 1.]: + weight_tensor = torch.tensor(weight, device=device, dtype=dtype, requires_grad=requires_grad) + yield SampleInput(make_arg(shape), kwargs=dict(weight=weight_tensor)) - if len(shape) >= 2: - channel_size = shape[1] - yield SampleInput(make_arg(shape), kwargs=dict(weight=make_arg((channel_size,)))) - - return list(generator()) + if len(shape) >= 2: + channel_size = shape[1] + yield 
SampleInput(make_arg(shape), kwargs=dict(weight=make_arg((channel_size,)))) def sample_inputs_norm(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -1730,14 +1715,11 @@ def sample_inputs_norm(op_info, device, dtype, requires_grad, **kwargs): new_args[1] *= -1 cases_negdim.append((shape, tuple(new_args), name.replace("_dim", "_neg_dim"))) - def generator(): - for shape, args, name in itertools.chain(cases, cases_negdim): - yield SampleInput(make_arg(shape), args=args, name=name) + for shape, args, name in itertools.chain(cases, cases_negdim): + yield SampleInput(make_arg(shape), args=args, name=name) - for shape, args, name in cases_nonzero_input: - yield SampleInput(make_arg(shape, exclude_zero=True), args=args, name=name) - - return list(generator()) + for shape, args, name in cases_nonzero_input: + yield SampleInput(make_arg(shape, exclude_zero=True), args=args, name=name) def sample_inputs_norm_fro(op_info, device, dtype, requires_grad, **kwargs): @@ -1749,11 +1731,8 @@ def sample_inputs_norm_fro(op_info, device, dtype, requires_grad, **kwargs): ((S, S), ('fro', [0, 1],), 'fro'), ) - def generator(): - for shape, args, name in cases: - yield SampleInput(make_arg(shape), args=args, name=name) - - return list(generator()) + for shape, args, name in cases: + yield SampleInput(make_arg(shape), args=args, name=name) def sample_inputs_norm_nuc(op_info, device, dtype, requires_grad, **kwargs): @@ -1764,11 +1743,8 @@ def sample_inputs_norm_nuc(op_info, device, dtype, requires_grad, **kwargs): ((S, S, S), ('nuc', [1, 2]), 'nuc_batched'), ) - def generator(): - for shape, args, name in cases: - yield SampleInput(make_arg(shape), args=args, name=name) - - return list(generator()) + for shape, args, name in cases: + yield SampleInput(make_arg(shape), args=args, name=name) def sample_inputs_norm_inf(op_info, device, dtype, requires_grad, **kwargs): @@ -1781,11 +1757,8 @@ def 
sample_inputs_norm_inf(op_info, device, dtype, requires_grad, **kwargs): ((S, S), (inf, -1,), 'inf_2_neg_dim'), ) - def generator(): - for shape, args, name in cases: - yield SampleInput(make_arg(shape), args=args, name=name) - - return list(generator()) + for shape, args, name in cases: + yield SampleInput(make_arg(shape), args=args, name=name) def sample_inputs_linalg_vector_norm(op_info, device, dtype, requires_grad, **kwargs): @@ -2169,13 +2142,10 @@ def sample_inputs_addmv(op_info, device, dtype, requires_grad, **kwargs): cases = test_cases + test_cases_with_broadcast - def generator(): - # addmv performs: beta * M + alpha * (mat @ vec) - for M, mat, vec, beta, alpha, broadcasts_input in cases: - yield SampleInput(make_arg(M), args=(make_arg(mat), make_arg(vec)), - kwargs=dict(beta=beta, alpha=alpha), broadcasts_input=broadcasts_input) - - return list(generator()) + # addmv performs: beta * M + alpha * (mat @ vec) + for size, mat, vec, beta, alpha, broadcasts_input in cases: + yield SampleInput(make_arg(size), args=(make_arg(mat), make_arg(vec)), + kwargs=dict(beta=beta, alpha=alpha), broadcasts_input=broadcasts_input) def sample_inputs_addbmm(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -2189,16 +2159,13 @@ def sample_inputs_addbmm(op_info, device, dtype, requires_grad, **kwargs): ((), (S, S, S), (S, S, M), 0.6, 0.2, True), ] - def generator(): - for input_shape, batch1_shape, batch2_shape, beta, alpha, is_broadcasting in test_cases: - if dtype.is_complex: - beta_complex, alpha_complex = beta * (1 + 2j), alpha * (2 + 3j) - yield SampleInput(make_arg(input_shape), args=(make_arg(batch1_shape), make_arg(batch2_shape)), - kwargs=dict(beta=beta_complex, alpha=alpha_complex), broadcasts_input=is_broadcasting) + for input_shape, batch1_shape, batch2_shape, beta, alpha, is_broadcasting in test_cases: + if dtype.is_complex: + beta_complex, alpha_complex = beta * (1 + 2j), 
alpha * (2 + 3j) yield SampleInput(make_arg(input_shape), args=(make_arg(batch1_shape), make_arg(batch2_shape)), - kwargs=dict(beta=beta, alpha=alpha), broadcasts_input=is_broadcasting) - - return list(generator()) + kwargs=dict(beta=beta_complex, alpha=alpha_complex), broadcasts_input=is_broadcasting) + yield SampleInput(make_arg(input_shape), args=(make_arg(batch1_shape), make_arg(batch2_shape)), + kwargs=dict(beta=beta, alpha=alpha), broadcasts_input=is_broadcasting) def sample_inputs_addcmul_addcdiv(op_info, device, dtype, requires_grad, **kwargs): test_cases = [(((S, S), (S, S), (S, S)), False), @@ -2329,45 +2296,39 @@ def sample_inputs_xlogy(self, device, dtype, requires_grad, **kwargs): def sample_inputs_xlog1py(self, device, dtype, requires_grad): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) - def generator(): - # same shape - yield SampleInput(make_arg((S, S)), args=(make_arg((S, S), low=-1),)) - # rhs broadcast - yield SampleInput(make_arg((S, S)), args=(make_arg((S,), low=-1),)) - # all zero `x` - with torch.no_grad(): - x = make_arg((S, S)) - x.fill_(0) - yield SampleInput(x, args=(make_arg((S, S), low=-1),)) - - # randomly zero-masked `x` + # same shape + yield SampleInput(make_arg((S, S)), args=(make_arg((S, S), low=-1),)) + # rhs broadcast + yield SampleInput(make_arg((S, S)), args=(make_arg((S,), low=-1),)) + # all zero `x` + with torch.no_grad(): x = make_arg((S, S)) - y = make_arg((S, S), low=-1) - with torch.no_grad(): - x[torch.rand(x.shape) > 0.5] = 0 - yield SampleInput(x, args=(y,)) + x.fill_(0) + yield SampleInput(x, args=(make_arg((S, S), low=-1),)) - # Scalar x - # `input` has to be a tensor - # yield SampleInput(0, args=(make_arg((S, S), low=-1),)) - # yield SampleInput(2.1, args=(make_arg((S, S), low=-1),)) + # randomly zero-masked `x` + x = make_arg((S, S)) + y = make_arg((S, S), low=-1) + with torch.no_grad(): + x[torch.rand(x.shape) > 0.5] = 0 + yield SampleInput(x, args=(y,)) - # Scalar y - 
yield SampleInput(make_arg((S, S)), args=(-0.5,)) - yield SampleInput(make_arg((S, S)), args=(1.2,)) + # Scalar x + # `input` has to be a tensor + # yield SampleInput(0, args=(make_arg((S, S), low=-1),)) + # yield SampleInput(2.1, args=(make_arg((S, S), low=-1),)) - return list(generator()) + # Scalar y + yield SampleInput(make_arg((S, S)), args=(-0.5,)) + yield SampleInput(make_arg((S, S)), args=(1.2,)) def sample_inputs_zero_(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) cases = ((), (S, S, S), (S,)) - def generator(): - for shape in cases: - yield(SampleInput(make_arg(shape))) - - return list(generator()) + for shape in cases: + yield(SampleInput(make_arg(shape))) def sample_inputs_logsumexp(self, device, dtype, requires_grad): @@ -2597,11 +2558,8 @@ def sample_inputs_renorm(self, device, dtype, requires_grad, **kwargs): ((S, S, S), (float('inf'), 2, 0.5)), ) - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=args) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=args) def sample_inputs_transpose_swapdims(self, device, dtype, requires_grad, **kwargs): @@ -2616,23 +2574,20 @@ def sample_inputs_transpose_swapdims(self, device, dtype, requires_grad, **kwarg ((M, M), (0, 1)), ((S, S, S), (2, 0)), ) - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=args) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=args) def sample_inputs_adjoint(self, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) shapes = ((1, 2, 3), (), (M, M), (S, S, S), (S, M, S), (M, S, M, S)) - return list(SampleInput(make_arg(shape)) for shape in shapes) + return (SampleInput(make_arg(shape)) for shape in shapes) def sample_inputs_T(self, device, dtype, 
requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) shapes = ((), (M, M)) - return list(SampleInput(make_arg(shape)) for shape in shapes) + return (SampleInput(make_arg(shape)) for shape in shapes) def sample_inputs_linalg_invertible(op_info, device, dtype, requires_grad=False, **kwargs): @@ -2672,16 +2627,13 @@ def sample_inputs_linalg_pinv_singular(op_info, device, dtype, requires_grad=Fal # of the pinv's backward method, albeit it is slow. size = [0, 3, 50] - def generate_samples(): - for batch, m, n in product(batches, size, size): - for k in range(min(3, min(m, n))): - # Note that by making the columns of `a` and `b` orthonormal we make sure that - # the product matrix `a @ b.t()` has condition number 1 when restricted to its image - a = torch.rand(*batch, m, k, device=device, dtype=dtype).qr().Q.requires_grad_(requires_grad) - b = torch.rand(*batch, n, k, device=device, dtype=dtype).qr().Q.requires_grad_(requires_grad) - yield SampleInput(a, args=(b,)) - - return list(generate_samples()) + for batch, m, n in product(batches, size, size): + for k in range(min(3, min(m, n))): + # Note that by making the columns of `a` and `b` orthonormal we make sure that + # the product matrix `a @ b.t()` has condition number 1 when restricted to its image + a = torch.rand(*batch, m, k, device=device, dtype=dtype).qr().Q.requires_grad_(requires_grad) + b = torch.rand(*batch, n, k, device=device, dtype=dtype).qr().Q.requires_grad_(requires_grad) + yield SampleInput(a, args=(b,)) def sample_inputs_singular_matrix_factors(op_info, device, dtype, requires_grad=False, **kwargs): @@ -2791,11 +2743,8 @@ def sample_inputs_linalg_cond(op_info, device, dtype, requires_grad=False, **kwa (2, S, S), (2, 1, S, S), ) - def generator(): - for shape in shapes: - yield SampleInput(make_arg(shape)) - - return list(generator()) + for shape in shapes: + yield SampleInput(make_arg(shape)) def np_sinc_with_fp16_as_fp32(x): # Wraps 
numpy's sinc function so that fp16 values are promoted to fp32 @@ -2905,11 +2854,8 @@ def sample_inputs_fill_(op_info, device, dtype, requires_grad, **kwargs): # check https://github.com/pytorch/pytorch/issues/59137 ((S, S, S), (make_arg((), requires_grad=False),))) - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=args) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=args) def sample_inputs_comparison_ops(self, device, dtype, requires_grad, **kwargs): @@ -2980,11 +2926,8 @@ def sample_inputs_cat_concat(op_info, device, dtype, requires_grad, **kwargs): ((1,), (1,), {}) # dim not passed, fallback to default ) - def generator(): - for input_shape1, input_shape2, kwargs in cases: - yield SampleInput([make_arg(input_shape1), make_arg(input_shape2)], kwargs=kwargs) - - return list(generator()) + for input_shape1, input_shape2, kwargs in cases: + yield SampleInput([make_arg(input_shape1), make_arg(input_shape2)], kwargs=kwargs) def sample_inputs_hstack_dstack_vstack(op_info, device, dtype, requires_grad, **kwargs): tensors = [ @@ -3438,13 +3381,10 @@ def sample_inputs_unique(op_info, device, dtype, requires_grad, **kwargs): return sample_inputs def sample_inputs_unique_consecutive(*args, **kwargs): - def generator(): - for sample_input in sample_inputs_unique(*args, **kwargs): - if not sample_input.kwargs["sorted"]: - sample_input.kwargs.pop("sorted") - yield sample_input - - return list(generator()) + for sample_input in sample_inputs_unique(*args, **kwargs): + if not sample_input.kwargs["sorted"]: + sample_input.kwargs.pop("sorted") + yield sample_input def sample_inputs_index_fill(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -3509,11 +3449,8 @@ def sample_inputs_adaptive_avg_pool1d(op_info, device, dtype, requires_grad, **k ((3, 8, 8), 1) ) - def generator(): - for input_shape, 
output_size in cases: - yield SampleInput(make_arg(input_shape), args=(output_size,)) - - return list(generator()) + for input_shape, output_size in cases: + yield SampleInput(make_arg(input_shape), args=(output_size,)) def sample_inputs_adaptive_avg_pool2d(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -3527,11 +3464,8 @@ def sample_inputs_adaptive_avg_pool2d(op_info, device, dtype, requires_grad, **k ((1, 8, 4, 3), (5)), ) - def generator(): - for input_shape, output_size in cases: - yield SampleInput(make_arg(input_shape), args=(output_size,)) - - return list(generator()) + for input_shape, output_size in cases: + yield SampleInput(make_arg(input_shape), args=(output_size,)) def sample_inputs_adaptive_avg_pool3d(op_info, device, dtype, requires_grad, **kwargs): @@ -3547,11 +3481,8 @@ def sample_inputs_adaptive_avg_pool3d(op_info, device, dtype, requires_grad, **k ((3, 3, 8, 8, 6), (None, 3, 2)), ) - def generator(): - for input_shape, output_size in cases: - yield SampleInput(make_arg(input_shape), args=(output_size,)) - - return list(generator()) + for input_shape, output_size in cases: + yield SampleInput(make_arg(input_shape), args=(output_size,)) def sample_inputs_adaptive_max_pool1d(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -3564,11 +3495,8 @@ def sample_inputs_adaptive_max_pool1d(op_info, device, dtype, requires_grad, **k ((3, 4, 4), 1) ) - def generator(): - for shapes, return_idx in product(cases, (True, False)): - yield SampleInput(make_arg(shapes[0]), args=(shapes[1], return_idx)) - - return list(generator()) + for shapes, return_idx in product(cases, (True, False)): + yield SampleInput(make_arg(shapes[0]), args=(shapes[1], return_idx)) def sample_inputs_adaptive_max_pool2d(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, 
device=device, dtype=dtype, requires_grad=requires_grad) @@ -3585,11 +3513,8 @@ def sample_inputs_adaptive_max_pool2d(op_info, device, dtype, requires_grad, **k ((1, 4, 4, 3), (3)), ) - def generator(): - for shapes, return_idx in product(cases, (True, False)): - yield SampleInput(make_arg(shapes[0]), args=(shapes[1], return_idx)) - - return list(generator()) + for shapes, return_idx in product(cases, (True, False)): + yield SampleInput(make_arg(shapes[0]), args=(shapes[1], return_idx)) def sample_inputs_adaptive_max_pool3d(op_info, device, dtype, requires_grad, **kwargs): @@ -3606,11 +3531,8 @@ def sample_inputs_adaptive_max_pool3d(op_info, device, dtype, requires_grad, **k ((3, 3, 4, 4, 6), (None, 3, 2)), ) - def generator(): - for shapes, return_idx in product(cases, (True, False)): - yield SampleInput(make_arg(shapes[0]), args=(shapes[1], return_idx)) - - return list(generator()) + for shapes, return_idx in product(cases, (True, False)): + yield SampleInput(make_arg(shapes[0]), args=(shapes[1], return_idx)) class _TestParamsMaxPoolBase(object): @@ -3689,13 +3611,10 @@ def sample_inputs_max_pool(op_info, device, dtype, requires_grad, **kwargs): 'nn.functional.max_pool3d': _TestParamsMaxPool3d, } - def generator(): - params_generator = params_generator_type_dict[op_info.name]() - for (shape, memory_format), kwargs in params_generator.gen_input_params(): - arg = make_arg(shape).to(memory_format=memory_format).requires_grad_(requires_grad) - yield SampleInput(arg, kwargs=kwargs) - - return list(generator()) + params_generator = params_generator_type_dict[op_info.name]() + for (shape, memory_format), kwargs in params_generator.gen_input_params(): + arg = make_arg(shape).to(memory_format=memory_format).requires_grad_(requires_grad) + yield SampleInput(arg, kwargs=kwargs) def sample_inputs_normalize(self, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, low=-1, high=1, device=device, dtype=dtype, requires_grad=requires_grad) @@ -3710,11 +3629,8 
@@ def sample_inputs_normalize(self, device, dtype, requires_grad, **kwargs): ((2, 3, 4, 5), {}), ((2, 3, 4, 5), {'eps': 1e-4})) - def generator(): - for input_shape, kwargs in cases: - yield SampleInput(make_arg(input_shape), kwargs=kwargs) - - return list(generator()) + for input_shape, kwargs in cases: + yield SampleInput(make_arg(input_shape), kwargs=kwargs) def sample_inputs_conv_transpose1d(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -3761,14 +3677,11 @@ def sample_inputs_conv_transpose2d(op_info, device, dtype, requires_grad, **kwar {}) ) - def generator(): - for input_shape, weight, bias, kwargs in cases: - yield SampleInput(make_arg(input_shape), args=( - make_arg(weight), - make_arg(bias) if bias is not None else bias - ), kwargs=kwargs) - - return list(generator()) + for input_shape, weight, bias, kwargs in cases: + yield SampleInput(make_arg(input_shape), args=( + make_arg(weight), + make_arg(bias) if bias is not None else bias + ), kwargs=kwargs) def sample_inputs_conv_transpose3d(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -3816,14 +3729,11 @@ def sample_inputs_conv1d(op_info, device, dtype, requires_grad, **kwargs): # Should replace test_conv_modules_raise_error_on_incorrect_input_size and test_conv_shapecheck # in test/test_nn.py - def generator(): - for input_shape, weight, bias, kwargs in cases: - yield SampleInput(make_arg(input_shape), args=( - make_arg(weight), - make_arg(bias) if bias is not None else bias - ), kwargs=kwargs) - - return list(generator()) + for input_shape, weight, bias, kwargs in cases: + yield SampleInput(make_arg(input_shape), args=( + make_arg(weight), + make_arg(bias) if bias is not None else bias + ), kwargs=kwargs) def sample_inputs_conv2d(op_info, device, dtype, requires_grad, jit_fail_sample=False, **kwargs): @@ -3858,14 
+3768,11 @@ def sample_inputs_conv2d(op_info, device, dtype, requires_grad, jit_fail_sample= ((1, 4, 5, 5), (3, 4, 3, 3), None, {}), ) - def generator(): - for input_shape, weight, bias, kwargs in cases: - yield SampleInput(make_arg(input_shape), args=( - make_arg(weight), - make_arg(bias) if bias is not None else bias - ), kwargs=kwargs) - - return list(generator()) + for input_shape, weight, bias, kwargs in cases: + yield SampleInput(make_arg(input_shape), args=( + make_arg(weight), + make_arg(bias) if bias is not None else bias + ), kwargs=kwargs) def sample_inputs_group_norm(opinfo, device, dtype, requires_grad, **kwargs): @@ -3879,21 +3786,18 @@ def sample_inputs_group_norm(opinfo, device, dtype, requires_grad, **kwargs): ((0, 2), 1, None), ) - def generator(): - for input_shape, num_groups, eps in cases: - # Shape of weight and bias should be the same as num_channels - weight = make_arg(input_shape[1]) - bias = make_arg(input_shape[1]) - kwargs = {'weight': weight, 'bias': bias} if eps is None else {'weight': weight, 'bias': bias, 'eps': eps} - yield SampleInput( - make_arg(input_shape), - args=(num_groups,), - kwargs=kwargs - ) - # Without any optional args - yield SampleInput(make_arg((1, 2)), args=(1,)) - - return list(generator()) + for input_shape, num_groups, eps in cases: + # Shape of weight and bias should be the same as num_channels + weight = make_arg(input_shape[1]) + bias = make_arg(input_shape[1]) + kwargs = {'weight': weight, 'bias': bias} if eps is None else {'weight': weight, 'bias': bias, 'eps': eps} + yield SampleInput( + make_arg(input_shape), + args=(num_groups,), + kwargs=kwargs + ) + # Without any optional args + yield SampleInput(make_arg((1, 2)), args=(1,)) def sample_inputs_instance_norm(opinfo, device, dtype, requires_grad, **kwargs): @@ -3910,51 +3814,48 @@ def sample_inputs_instance_norm(opinfo, device, dtype, requires_grad, **kwargs): ((3, 2, 3, 4), {'momentum': -1.0, 'eps': 0.5}), ) - def generator(): - for input_shape, kwargs in 
cases: - # args: running mean, running var, weight and bias should necessarily be of shape: (channels,) - channels = input_shape[1] - weight = make_arg(channels) - bias = make_arg(channels) - running_mean = make_arg_without_requires_grad(channels, low=0) - running_var = make_arg_without_requires_grad(channels, low=0) - new_kwargs = { + for input_shape, kwargs in cases: + # args: running mean, running var, weight and bias should necessarily be of shape: (channels,) + channels = input_shape[1] + weight = make_arg(channels) + bias = make_arg(channels) + running_mean = make_arg_without_requires_grad(channels, low=0) + running_var = make_arg_without_requires_grad(channels, low=0) + new_kwargs = { + 'running_mean': running_mean, + 'running_var': running_var, + 'weight': weight, + 'bias': bias, + **kwargs + } + + yield SampleInput( + make_arg(input_shape), + args=(), + kwargs=new_kwargs + ) + + # Checking for permutations of weights and biases as `None` + # instance_norm assumes that if there's a bias, there's a weight + weights = [channels, None] + biases = [None, None] + + for weight_channels, bias_channels in zip(weights, biases): + running_mean = make_arg_without_requires_grad(channels, low=0) + running_var = make_arg_without_requires_grad(channels, low=0) + yield SampleInput( + make_arg(input_shape), + args=(), + kwargs={ 'running_mean': running_mean, 'running_var': running_var, - 'weight': weight, - 'bias': bias, - **kwargs + 'weight': make_arg(weight_channels) if weight_channels is not None else None, + 'bias': make_arg(bias_channels) if bias_channels is not None else None } + ) - yield SampleInput( - make_arg(input_shape), - args=(), - kwargs=new_kwargs - ) - - # Checking for permutations of weights and biases as `None` - # instance_norm assumes that if there's a bias, there's a weight - weights = [channels, None] - biases = [None, None] - - for weight_channels, bias_channels in zip(weights, biases): - running_mean = make_arg_without_requires_grad(channels, low=0) 
- running_var = make_arg_without_requires_grad(channels, low=0) - yield SampleInput( - make_arg(input_shape), - args=(), - kwargs={ - 'running_mean': running_mean, - 'running_var': running_var, - 'weight': make_arg(weight_channels) if weight_channels is not None else None, - 'bias': make_arg(bias_channels) if bias_channels is not None else None - } - ) - - # Test case for no optional kwargs - yield SampleInput(make_arg((1, 2, 3)), kwargs={}) - - return list(generator()) + # Test case for no optional kwargs + yield SampleInput(make_arg((1, 2, 3)), kwargs={}) def sample_inputs_layer_norm(opinfo, device, dtype, requires_grad, **kwargs): @@ -3969,29 +3870,26 @@ def sample_inputs_layer_norm(opinfo, device, dtype, requires_grad, **kwargs): ((0, 1), (1,), {}), ) - def generator(): - for input_shape, normalized_shape, kwargs in cases: - # Shape of weight and bias should be the same as normalized_shape - weight = make_arg(normalized_shape) - bias = make_arg(normalized_shape) - yield SampleInput( - make_arg(input_shape), - args=(normalized_shape, weight, bias), - kwargs=kwargs - ) - # Without any optional args - yield SampleInput(make_arg((1, 2)), args=((2,),)) + for input_shape, normalized_shape, kwargs in cases: + # Shape of weight and bias should be the same as normalized_shape + weight = make_arg(normalized_shape) + bias = make_arg(normalized_shape) + yield SampleInput( + make_arg(input_shape), + args=(normalized_shape, weight, bias), + kwargs=kwargs + ) + # Without any optional args + yield SampleInput(make_arg((1, 2)), args=((2,),)) - # TODO: @krshrimali, once to_numpy method in SampleInput class is modified to take None inputs, - # enable these inputs; see https://github.com/pytorch/pytorch/pull/63276#discussion_r691950400 + # TODO: @krshrimali, once to_numpy method in SampleInput class is modified to take None inputs, + # enable these inputs; see https://github.com/pytorch/pytorch/pull/63276#discussion_r691950400 - # With weight and a `None` bias - # yield 
SampleInput(make_arg((1, 2)), args=((2,), make_arg((2,)), None)) + # With weight and a `None` bias + # yield SampleInput(make_arg((1, 2)), args=((2,), make_arg((2,)), None)) - # With `None` weight and bias (tests failing for this, see the link above) - # yield SampleInput(make_arg((1, 2)), args=((2,), None, make_arg((2,)))) - - return list(generator()) + # With `None` weight and bias (tests failing for this, see the link above) + # yield SampleInput(make_arg((1, 2)), args=((2,), None, make_arg((2,)))) def sample_inputs_local_response_norm(opinfo, device, dtype, requires_grad, **kwargs): @@ -4012,11 +3910,8 @@ def sample_inputs_local_response_norm(opinfo, device, dtype, requires_grad, **kw ((0, 1, 2), 1, {'alpha': 3e-05, 'beta': 0.5, 'k': 1.25}), ) - def generator(): - for input_shape, size, kwargs in cases: - yield SampleInput(make_arg(input_shape), args=(size,), kwargs=kwargs) - - return list(generator()) + for input_shape, size, kwargs in cases: + yield SampleInput(make_arg(input_shape), args=(size,), kwargs=kwargs) def sample_inputs_hardswish(self, device, dtype, requires_grad): @@ -4351,14 +4246,11 @@ def sample_inputs_avgpool2d(op_info, device, dtype, requires_grad, **kwargs): ((1, 1, 4, 4), (2, 2), (), (0, ), False, True, -2), ((1, 2, 6, 6), (4, 4), (2, 2), (2, ), True, True, None)) - def generator(): - for input_shape, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override in cases: - yield SampleInput(make_arg(input_shape), - args=(kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)) - # Case with just input_shape and kernel_size - yield SampleInput(make_arg((1, 3, 9, 9)), args=((3, 3))) - - return list(generator()) + for input_shape, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override in cases: + yield SampleInput(make_arg(input_shape), + args=(kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)) + # Case with just input_shape and kernel_size + yield 
SampleInput(make_arg((1, 3, 9, 9)), args=((3, 3))) def sample_inputs_avgpool1d(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -4376,11 +4268,8 @@ def sample_inputs_avgpool1d(op_info, device, dtype, requires_grad, **kwargs): ((1, 2, 9), (7,), dict(stride=(3,), ceil_mode=True)), ] - def generator(): - for input_shape, kernel_size, kwargs in cases: - yield SampleInput(make_arg(input_shape), args=(kernel_size,), kwargs=kwargs) - - return list(generator()) + for input_shape, kernel_size, kwargs in cases: + yield SampleInput(make_arg(input_shape), args=(kernel_size,), kwargs=kwargs) def sample_inputs_avgpool3d(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -4403,11 +4292,8 @@ def sample_inputs_avgpool3d(op_info, device, dtype, requires_grad, **kwargs): count_include_pad=True, divisor_override=None)), ] - def generator(): - for input_shape, kernel_size, kwargs in cases: - yield SampleInput(make_arg(input_shape), args=(kernel_size,), kwargs=kwargs) - - return list(generator()) + for input_shape, kernel_size, kwargs in cases: + yield SampleInput(make_arg(input_shape), args=(kernel_size,), kwargs=kwargs) def sample_inputs_topk(op_info, device, dtype, requires_grad, **kwargs): def get_tensor_input(size): @@ -4447,13 +4333,10 @@ def sample_inputs_igamma_igammac(op_info, device, dtype, requires_grad, **kwargs ((S, ), (S, S), True), ((), (), False)) - def generator(): - for shape, other_shape, broadcasts_input in cases: - yield SampleInput(make_arg(shape, requires_grad=requires_grad), - args=(make_arg(other_shape, requires_grad=False),), - broadcasts_input=broadcasts_input) - - return list(generator()) + for shape, other_shape, broadcasts_input in cases: + yield SampleInput(make_arg(shape, requires_grad=requires_grad), + args=(make_arg(other_shape, requires_grad=False),), + 
broadcasts_input=broadcasts_input) def sample_inputs_dist(op_info, device, dtype, requires_grad): @@ -4461,11 +4344,8 @@ def sample_inputs_dist(op_info, device, dtype, requires_grad): sizes = ((S, S, S), (S,), (S, 1, S), (), (S, S)) ps = (2, 4) - def generate_samples(): - for size_x, size_y, p in product(sizes, sizes, ps): - yield SampleInput(make_arg(size_x), args=(make_arg(size_y), p)) - - return list(generate_samples()) + for size_x, size_y, p in product(sizes, sizes, ps): + yield SampleInput(make_arg(size_x), args=(make_arg(size_y), p)) # Missing to test the nondeterminism of the operation # https://github.com/pytorch/pytorch/issues/53352 @@ -4516,39 +4396,36 @@ def sample_inputs_put(op_info, device, dtype, requires_grad): S = 3 - def gen_inputs(): - # Generic inputs - idx = torch.randperm(S * S, device=device, dtype=torch.int64)[:S] - idx_list = [idx, -idx - 1] - for idx, acc in product(idx_list, (True, False)): - yield SampleInput(input=make_arg((S, S)), - args=(idx.detach().clone(), - make_arg((S,)), - acc)) + # Generic inputs + idx = torch.randperm(S * S, device=device, dtype=torch.int64)[:S] + idx_list = [idx, -idx - 1] + for idx, acc in product(idx_list, (True, False)): + yield SampleInput(input=make_arg((S, S)), + args=(idx.detach().clone(), + make_arg((S,)), + acc)) - # Scalar cases - scalar_sizes = [(), (1,)] - tgt_gen = (make_arg(size) for size in scalar_sizes) - idx_gen = (make_idx(size, high=1) for size in scalar_sizes) - src_gen = (make_arg(size) for size in scalar_sizes) - for tgt, idx, src, acc in product(tgt_gen, idx_gen, src_gen, (True, False)): - yield SampleInput(input=tgt.detach().clone().requires_grad_(requires_grad), - args=(idx.detach().clone(), - src.detach().clone().requires_grad_(requires_grad), - acc)) + # Scalar cases + scalar_sizes = [(), (1,)] + tgt_gen = (make_arg(size) for size in scalar_sizes) + idx_gen = (make_idx(size, high=1) for size in scalar_sizes) + src_gen = (make_arg(size) for size in scalar_sizes) + for tgt, idx, src, 
acc in product(tgt_gen, idx_gen, src_gen, (True, False)): + yield SampleInput(input=tgt.detach().clone().requires_grad_(requires_grad), + args=(idx.detach().clone(), + src.detach().clone().requires_grad_(requires_grad), + acc)) - # Empty cases - tgt_sizes = [(0,), (), (1,), (3, 2)] - tgt_gen = (make_arg(size) for size in tgt_sizes) - idx = make_idx((0,), high=1) - src = make_arg((0,)) - for tgt, acc in product(tgt, (True, False)): - yield SampleInput(input=tgt.detach().clone().requires_grad_(requires_grad), - args=(idx.detach().clone(), - src.detach().clone().requires_grad_(requires_grad), - acc)) - - return list(gen_inputs()) + # Empty cases + tgt_sizes = [(0,), (), (1,), (3, 2)] + tgt_gen = (make_arg(size) for size in tgt_sizes) + idx = make_idx((0,), high=1) + src = make_arg((0,)) + for tgt, acc in product(tgt, (True, False)): + yield SampleInput(input=tgt.detach().clone().requires_grad_(requires_grad), + args=(idx.detach().clone(), + src.detach().clone().requires_grad_(requires_grad), + acc)) def sample_inputs_take(op_info, device, dtype, requires_grad): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) @@ -4556,29 +4433,26 @@ def sample_inputs_take(op_info, device, dtype, requires_grad): S = 3 - def gen_inputs(): - # Generic inputs: take S elements out of S * S - index = make_idx((S,), high=(S * S)) - for idx in (index, -index - 1): - yield SampleInput(input=make_arg((S, S)), args=(idx,)) + # Generic inputs: take S elements out of S * S + index = make_idx((S,), high=(S * S)) + for idx in (index, -index - 1): + yield SampleInput(input=make_arg((S, S)), args=(idx,)) - # Scalar cases - scalar_sizes = [(), (1,)] - src_gen = (make_arg(size) for size in scalar_sizes) - idx_gen = (make_idx(size, high=1) for size in scalar_sizes) - for src, idx in product(src_gen, idx_gen): - yield SampleInput(input=src.detach().clone().requires_grad_(requires_grad), - args=(idx.detach().clone(),)) + # Scalar cases + scalar_sizes = [(), (1,)] + 
src_gen = (make_arg(size) for size in scalar_sizes) + idx_gen = (make_idx(size, high=1) for size in scalar_sizes) + for src, idx in product(src_gen, idx_gen): + yield SampleInput(input=src.detach().clone().requires_grad_(requires_grad), + args=(idx.detach().clone(),)) - # Empty cases - src_sizes = [(0,), (), (1,), (3, 2)] - src_gen = (make_arg(size) for size in src_sizes) - idx = make_idx((0,), high=1) - for src in src_gen: - yield SampleInput(input=src.detach().clone().requires_grad_(requires_grad), - args=(idx.detach().clone(),)) - - return list(gen_inputs()) + # Empty cases + src_sizes = [(0,), (), (1,), (3, 2)] + src_gen = (make_arg(size) for size in src_sizes) + idx = make_idx((0,), high=1) + for src in src_gen: + yield SampleInput(input=src.detach().clone().requires_grad_(requires_grad), + args=(idx.detach().clone(),)) def sample_movedim_moveaxis(op_info, device, dtype, requires_grad): return ( @@ -4622,13 +4496,10 @@ def sample_inputs_narrow(op_info, device, dtype, requires_grad, **kwargs): ((S, S, S), (-1, 0, 0)), ) - def generator(): - for shape, args in shapes_and_args: - tensor = make_tensor(shape, device, dtype, low=None, high=None, - requires_grad=requires_grad) - yield SampleInput(tensor, args=args) - - return list(generator()) + for shape, args in shapes_and_args: + tensor = make_tensor(shape, device, dtype, low=None, high=None, + requires_grad=requires_grad) + yield SampleInput(tensor, args=args) def sample_trapezoid(op_info, device, dtype, requires_grad, **kwargs): y_shape_x_shape_and_kwargs = [ @@ -4711,17 +4582,14 @@ def sample_inputs_nn_unfold(op_info, device, dtype, requires_grad, **kwargs): paddings = (0, 1, (1, 1)) strides = (1, 2, (1, 2)) - def generator(): - cases = product(shapes, kernel_sizes, dilations, paddings, strides) - for shape, kernel_size, dilation, padding, stride in cases: - tensor = make_tensor(shape, device, dtype, requires_grad=requires_grad) - yield SampleInput(tensor, args=(kernel_size, dilation, padding, stride)) + cases 
= product(shapes, kernel_sizes, dilations, paddings, strides) + for shape, kernel_size, dilation, padding, stride in cases: + tensor = make_tensor(shape, device, dtype, requires_grad=requires_grad) + yield SampleInput(tensor, args=(kernel_size, dilation, padding, stride)) - # With default args - yield SampleInput(make_tensor((1, 1, 5, 5), device, dtype, requires_grad=requires_grad), - args=((3, 3),)) - - return list(generator()) + # With default args + yield SampleInput(make_tensor((1, 1, 5, 5), device, dtype, requires_grad=requires_grad), + args=((3, 3),)) def sample_inputs_squeeze(op_info, device, dtype, requires_grad, **kwargs): @@ -4735,14 +4603,11 @@ def sample_inputs_squeeze(op_info, device, dtype, requires_grad, **kwargs): ((), (0, )), ) - def generator(): - for shape, args in shapes_and_args: - tensor = make_tensor(shape, device, dtype, low=None, high=None, - requires_grad=requires_grad) + for shape, args in shapes_and_args: + tensor = make_tensor(shape, device, dtype, low=None, high=None, + requires_grad=requires_grad) - yield SampleInput(tensor, args=args) - - return list(generator()) + yield SampleInput(tensor, args=args) def sample_inputs_nn_pad(op_info, device, dtype, requires_grad, mode, **kwargs): @@ -4813,20 +4678,17 @@ def sample_inputs_nn_pad(op_info, device, dtype, requires_grad, mode, **kwargs): make_inp = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) - def generator(): - if mode == 'constant': - # Default args - yield SampleInput(make_inp((1, 3, 3)), args=((2, 2),)) + if mode == 'constant': + # Default args + yield SampleInput(make_inp((1, 3, 3)), args=((2, 2),)) - if mode in ['reflect', 'replicate', 'circular']: + if mode in ['reflect', 'replicate', 'circular']: + for shape, pad in cases: + yield SampleInput(make_inp(shape), args=(pad, mode)) + else: # mode == 'constant' + for pad_value in (1., 2.): for shape, pad in cases: - yield SampleInput(make_inp(shape), args=(pad, mode)) - else: # mode == 'constant' - for 
pad_value in (1., 2.): - for shape, pad in cases: - yield SampleInput(make_inp(shape), args=(pad, mode, pad_value)) - - return list(generator()) + yield SampleInput(make_inp(shape), args=(pad, mode, pad_value)) # TODO: reconcile with torch.linalg.det and torch.linalg.slogdet @@ -5378,39 +5240,36 @@ def sample_inputs_linalg_solve_triangular(op_info, device, dtype, requires_grad= ns = (3, 0) ks = (1, 3, 0) - def gen_inputs(): - for b, n, k, (left, upper, uni) in product(bs, ns, ks, product((True, False), repeat=3)): - with torch.no_grad(): - if b == 1: - A = make_arg((n, n)) if left else make_arg((k, k)) - B = make_arg((n, k)) - else: - A = make_arg((b, n, n)) if left else make_arg((b, k, k)) - B = make_arg((b, n, k)) - if uni: - # Not really necessary, but writing it for consistency - A.diagonal(0, -2, -1).fill_(1.) - else: - d = A.diagonal(0, -2, -1) - d[d.abs() < 1e-6] = 1. - if upper: - A.triu_() - else: - A.tril_() - kwargs = {"upper": upper, "left": left, "unitriangular": uni} - if requires_grad: - for grad_A, grad_B in product((True, False), repeat=2): - # Either A or B needs to have a gradient - if not grad_A and not grad_B: - continue - yield SampleInput( - A.clone().requires_grad_(grad_A), - args=(B.clone().requires_grad_(grad_B),), - kwargs=kwargs) + for b, n, k, (left, upper, uni) in product(bs, ns, ks, product((True, False), repeat=3)): + with torch.no_grad(): + if b == 1: + A = make_arg((n, n)) if left else make_arg((k, k)) + B = make_arg((n, k)) else: - yield SampleInput(A, args=(B,), kwargs=kwargs) - - return list(gen_inputs()) + A = make_arg((b, n, n)) if left else make_arg((b, k, k)) + B = make_arg((b, n, k)) + if uni: + # Not really necessary, but writing it for consistency + A.diagonal(0, -2, -1).fill_(1.) + else: + d = A.diagonal(0, -2, -1) + d[d.abs() < 1e-6] = 1. 
+ if upper: + A.triu_() + else: + A.tril_() + kwargs = {"upper": upper, "left": left, "unitriangular": uni} + if requires_grad: + for grad_A, grad_B in product((True, False), repeat=2): + # Either A or B needs to have a gradient + if not grad_A and not grad_B: + continue + yield SampleInput( + A.clone().requires_grad_(grad_A), + args=(B.clone().requires_grad_(grad_B),), + kwargs=kwargs) + else: + yield SampleInput(A, args=(B,), kwargs=kwargs) def sample_inputs_legacy_solve(op_info, device, dtype, requires_grad=False, **kwargs): """ @@ -5446,14 +5305,11 @@ def sample_inputs_cholesky_solve(op_info, device, dtype, requires_grad=False, ** def sample_inputs_lu(op_info, device, dtype, requires_grad=False, **kwargs): # not needed once OpInfo tests support Iterables - def generate_samples(): - batch_shapes = ((), (3,), (3, 3)) - for batch_shape, get_infos, size_delta in product(batch_shapes, (True, False), (-2, -1, 0, +1, +2)): - shape = batch_shape + (S + size_delta, S) - input = make_tensor(shape, device, dtype, requires_grad=requires_grad, low=None, high=None) - yield SampleInput(input, args=(True, get_infos)) - - return list(generate_samples()) + batch_shapes = ((), (3,), (3, 3)) + for batch_shape, get_infos, size_delta in product(batch_shapes, (True, False), (-2, -1, 0, +1, +2)): + shape = batch_shape + (S + size_delta, S) + input = make_tensor(shape, device, dtype, requires_grad=requires_grad, low=None, high=None) + yield SampleInput(input, args=(True, get_infos)) def sample_inputs_lu_solve(op_info, device, dtype, requires_grad=False, **kwargs): @@ -5463,61 +5319,52 @@ def sample_inputs_lu_solve(op_info, device, dtype, requires_grad=False, **kwargs ns = [5, 3, 0] nrhs = [0, 1, 6] - def generate_samples(): - for n, batch, rhs in product(ns, batches, nrhs): - a = random_fullrank_matrix_distinct_singular_value(n, *batch, dtype=dtype, device=device) - requires_grad_options = (False,) if not requires_grad else (True, False) - # we try all possible combinations of 
requires_grad for each input - for lu_requires_grad, b_requires_grad in product(requires_grad_options, requires_grad_options): - # when requires_grad == True, at least one input has to have requires_grad enabled - if requires_grad and not lu_requires_grad and not b_requires_grad: - continue - # we run LU several times to guarantee that the produced SampleInputs are independent - # this is especially important when setting different requries_grad for same tensors! - lu, pivs = a.lu() - lu.requires_grad = lu_requires_grad - b = torch.randn(*batch, n, rhs, dtype=dtype, device=device) - b.requires_grad = b_requires_grad - yield SampleInput(b, args=(lu, pivs)) - - return list(generate_samples()) + for n, batch, rhs in product(ns, batches, nrhs): + a = random_fullrank_matrix_distinct_singular_value(n, *batch, dtype=dtype, device=device) + requires_grad_options = (False,) if not requires_grad else (True, False) + # we try all possible combinations of requires_grad for each input + for lu_requires_grad, b_requires_grad in product(requires_grad_options, requires_grad_options): + # when requires_grad == True, at least one input has to have requires_grad enabled + if requires_grad and not lu_requires_grad and not b_requires_grad: + continue + # we run LU several times to guarantee that the produced SampleInputs are independent + # this is especially important when setting different requries_grad for same tensors! 
+ lu, pivs = a.lu() + lu.requires_grad = lu_requires_grad + b = torch.randn(*batch, n, rhs, dtype=dtype, device=device) + b.requires_grad = b_requires_grad + yield SampleInput(b, args=(lu, pivs)) def sample_inputs_lu_unpack(op_info, device, dtype, requires_grad=False, **kwargs): # not needed once OpInfo tests support Iterables - def generate_samples(): - for lu_sample in sample_inputs_lu(op_info, device, dtype, requires_grad, **kwargs): - lu_data, pivots = lu_sample.input.lu() + for lu_sample in sample_inputs_lu(op_info, device, dtype, requires_grad, **kwargs): + lu_data, pivots = lu_sample.input.lu() + yield SampleInput(lu_data, args=(pivots,)) + + # generate rectangular inputs + lu_data_shape = lu_data.shape + batch_shape = lu_data_shape[:-2] + n = lu_data_shape[-2] + + for shape_inc in ((1, 0), (0, 1)): + lu_data, pivots = make_tensor( + batch_shape + (n + shape_inc[0], n + shape_inc[1]), + device, dtype, + requires_grad=False, + low=None, high=None + ).lu() + lu_data.requires_grad_(requires_grad) yield SampleInput(lu_data, args=(pivots,)) - # generate rectangular inputs - lu_data_shape = lu_data.shape - batch_shape = lu_data_shape[:-2] - n = lu_data_shape[-2] - - for shape_inc in ((1, 0), (0, 1)): - lu_data, pivots = make_tensor( - batch_shape + (n + shape_inc[0], n + shape_inc[1]), - device, dtype, - requires_grad=False, - low=None, high=None - ).lu() - lu_data.requires_grad_(requires_grad) - yield SampleInput(lu_data, args=(pivots,)) - - return list(generate_samples()) - def sample_inputs_roll(op_info, device, dtype, requires_grad=False, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) args = ((0, 0), (1, 2), (0, 2), (2, 0), (-1, 0), (10000, 1), (2,), ((1, 2, -1), (0, 1, 2))) - def generator(): - for arg in args: - yield SampleInput(make_arg((S, S, S)), args=arg) - - return list(generator()) + for arg in args: + yield SampleInput(make_arg((S, S, S)), args=arg) def sample_inputs_rot90(op_info, device, dtype, 
requires_grad=False, **kwargs): @@ -5528,11 +5375,8 @@ def sample_inputs_rot90(op_info, device, dtype, requires_grad=False, **kwargs): (1, (1, -1),), ()) - def generator(): - for arg in args: - yield SampleInput(make_arg((S, S, S)), args=arg) - - return list(generator()) + for arg in args: + yield SampleInput(make_arg((S, S, S)), args=arg) def sample_inputs_std_var(op_info, device, dtype, requires_grad, **kwargs): @@ -5673,11 +5517,8 @@ def sample_inputs_permute(op_info, device, dtype, requires_grad, **kwargs): ((), ()), ((1, 2, 3, 4), (2, 1, 3, 0))] - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=(args,)) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=(args,)) # Based on erstwhile method_tests tests & some tensor_op_tests for pow @@ -5866,11 +5707,8 @@ def sample_inputs_flip(op_info, device, dtype, requires_grad): sizes = ((S, M, S), (S, 0, M)) all_dims = ((0, 1, 2), (0,), (0, 2), (-1,), ()) - def gen_samples(): - for size, dims in product(sizes, all_dims): - yield SampleInput(make_arg(size), kwargs={"dims": dims}) - - return list(gen_samples()) + for size, dims in product(sizes, all_dims): + yield SampleInput(make_arg(size), kwargs={"dims": dims}) def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad, **kwargs): tensors = ( @@ -5908,16 +5746,13 @@ def sample_inputs_fmod_remainder(op_info, device, dtype, requires_grad, *, autod samples = cases + cases_with_tensor_scalar + cases_with_broadcasting # type: ignore[assignment] - def generator(): - for shape, arg_other, broadcasts_input in samples: - if isinstance(arg_other, tuple): - arg = make_arg(arg_other, requires_grad=False, exclude_zero=True) - else: - # shape_other is scalar or torch.tensor - arg = arg_other - yield(SampleInput(make_arg(shape), args=(arg,), broadcasts_input=broadcasts_input)) - - return list(generator()) + for shape, arg_other, broadcasts_input in samples: + if isinstance(arg_other, 
tuple): + arg = make_arg(arg_other, requires_grad=False, exclude_zero=True) + else: + # shape_other is scalar or torch.tensor + arg = arg_other + yield(SampleInput(make_arg(shape), args=(arg,), broadcasts_input=broadcasts_input)) # TODO: clamp shares tensors among its sample inputs --- we should prohibit this! def sample_inputs_clamp(op_info, device, dtype, requires_grad, **kwargs): @@ -5992,22 +5827,18 @@ def sample_inputs_cumprod(op_info, device, dtype, requires_grad, **kwargs): result.narrow(dim_select[0], 4, 1).narrow(dim_select[1], 3, 1).zero_() return result - # will not be needed once OpInfo tests suport Iterables - def sample_generator(): - for dim in range(3): - yield SampleInput(make_arg((S, S, S)), args=(dim,)) - # Scalar tensors and empty tensor - for size in [(), (1,), (0,)]: - yield SampleInput(make_arg(size), args=(0,)) + for dim in range(3): + yield SampleInput(make_arg((S, S, S)), args=(dim,)) + # Scalar tensors and empty tensor + for size in [(), (1,), (0,)]: + yield SampleInput(make_arg(size), args=(0,)) - yield SampleInput(prod_zeros([0, 1]), args=(1,)) - yield SampleInput(prod_zeros([0, 2]), args=(1,)) - yield SampleInput(prod_zeros([1, 2]), args=(1,)) + yield SampleInput(prod_zeros([0, 1]), args=(1,)) + yield SampleInput(prod_zeros([0, 2]), args=(1,)) + yield SampleInput(prod_zeros([1, 2]), args=(1,)) - # test dtype kwarg - yield SampleInput(prod_zeros([1, 2]), args=(1,), kwargs={'dtype': dtype}) - - return list(sample_generator()) + # test dtype kwarg + yield SampleInput(prod_zeros([1, 2]), args=(1,), kwargs={'dtype': dtype}) def sample_inputs_view_as_complex(op_info, device, dtype, requires_grad, **kwargs): return [SampleInput(make_tensor((S, 2), device, dtype, requires_grad=requires_grad),)] @@ -6042,17 +5873,14 @@ def sample_inputs_copysign(op_info, device, dtype, requires_grad, **kwargs): # broadcast all cases.append(((S, 1, S), (M, S), True)) - def generator(): - for input_shape, arg_val, broadcasts_input in cases: - if 
isinstance(arg_val, tuple): - arg = _make_tensor(*arg_val) - else: - # arg_val is scalar - arg = arg_val + for input_shape, arg_val, broadcasts_input in cases: + if isinstance(arg_val, tuple): + arg = _make_tensor(*arg_val) + else: + # arg_val is scalar + arg = arg_val - yield SampleInput(_make_tensor(*input_shape), args=(arg, ), broadcasts_input=broadcasts_input) - - return list(generator()) + yield SampleInput(_make_tensor(*input_shape), args=(arg, ), broadcasts_input=broadcasts_input) def sample_inputs_prod(op_info, device, dtype, requires_grad): def make_arg(shape): @@ -6065,33 +5893,29 @@ def sample_inputs_prod(op_info, device, dtype, requires_grad): result[0, 1] = 0 return result - # will not be needed once OpInfo tests support Iterables - def sample_generator(): - for sample in sample_inputs_cumprod(op_info, device, dtype, requires_grad): - # only Tensor, ignore other inputs - yield SampleInput(sample.input.detach().clone().requires_grad_(requires_grad)) - yield sample + for sample in sample_inputs_cumprod(op_info, device, dtype, requires_grad): + # only Tensor, ignore other inputs + yield SampleInput(sample.input.detach().clone().requires_grad_(requires_grad)) + yield sample - # Generates samples with keepdim = True - for sample in sample_inputs_cumprod(op_info, device, dtype, requires_grad): - sample.kwargs['keepdim'] = True - yield sample + # Generates samples with keepdim = True + for sample in sample_inputs_cumprod(op_info, device, dtype, requires_grad): + sample.kwargs['keepdim'] = True + yield sample - yield SampleInput(prod_single_zero()) - yield SampleInput(make_arg((3, 3, 3)), args=(1,)) - yield SampleInput(make_arg((3, 3, 3)), args=(1,), kwargs={'keepdim': True}) + yield SampleInput(prod_single_zero()) + yield SampleInput(make_arg((3, 3, 3)), args=(1,)) + yield SampleInput(make_arg((3, 3, 3)), args=(1,), kwargs={'keepdim': True}) - # test zero scalar tensor - zero = make_arg(()) - with torch.no_grad(): - zero.zero_() - yield 
SampleInput(zero.detach().clone().requires_grad_(requires_grad)) - yield SampleInput(zero.detach().clone().requires_grad_(requires_grad), args=(0,)) - yield SampleInput(zero.detach().clone().requires_grad_(requires_grad), - args=(0,), - kwargs={'keepdim': True}) - - return list(sample_generator()) + # test zero scalar tensor + zero = make_arg(()) + with torch.no_grad(): + zero.zero_() + yield SampleInput(zero.detach().clone().requires_grad_(requires_grad)) + yield SampleInput(zero.detach().clone().requires_grad_(requires_grad), args=(0,)) + yield SampleInput(zero.detach().clone().requires_grad_(requires_grad), + args=(0,), + kwargs={'keepdim': True}) def error_inputs_neg(op_info, device, **kwargs): si = SampleInput(torch.tensor((False, True), device=device)) @@ -6109,11 +5933,8 @@ def sample_inputs_nextafter(op_info, device, dtype, requires_grad, **kwargs): ((S, ), (S, S), True) ) - def generator(): - for shape, other_shape, broadcasts_input in cases: - yield SampleInput(make_arg(shape), args=(make_arg(other_shape),), broadcasts_input=broadcasts_input) - - return list(generator()) + for shape, other_shape, broadcasts_input in cases: + yield SampleInput(make_arg(shape), args=(make_arg(other_shape),), broadcasts_input=broadcasts_input) def sample_inputs_diag(op_info, device, dtype, requires_grad, **kwargs): @@ -6145,11 +5966,8 @@ def sample_inputs_diagonal_diag_embed(op_info, device, dtype, requires_grad, **k args_2d = ((), (2,), (-2,), (1,)) args_3d = ((1, 1, 2), (2, 0, 1), (-2, 0, 1)) - def generator(): - for shape, arg in chain(product(shapes_2d, args_2d), product(shapes_3d, args_3d)): - yield SampleInput(make_arg(shape), args=arg) - - return list(generator()) + for shape, arg in chain(product(shapes_2d, args_2d), product(shapes_3d, args_3d)): + yield SampleInput(make_arg(shape), args=arg) def sample_inputs_diagonal_scatter(op_info, device, dtype, requires_grad, **kwargs): @@ -6164,20 +5982,17 @@ def sample_inputs_diagonal_scatter(op_info, device, dtype, 
requires_grad, **kwar args_2d = ((), (2,), (-2,), (1,)) args_3d = ((1, 1, 2), (2, 0, 1), (-2, 0, 1)) - def generator(): - for input_shape, arg in chain(product(shapes_2d, args_2d), product(shapes_3d, args_3d)): - input_ = make_arg(input_shape) - # We can programatically figure out the right shape for src: - # It should be the same size as input.diagonal(other_args...) - if not isinstance(arg, tuple): - arg_tuple = (arg,) - else: - arg_tuple = arg - src_shape = input_.diagonal(*arg_tuple).size() - src = make_arg(src_shape) - yield SampleInput(input_, args=(src, *arg_tuple)) - - return list(generator()) + for input_shape, arg in chain(product(shapes_2d, args_2d), product(shapes_3d, args_3d)): + input_ = make_arg(input_shape) + # We can programatically figure out the right shape for src: + # It should be the same size as input.diagonal(other_args...) + if not isinstance(arg, tuple): + arg_tuple = (arg,) + else: + arg_tuple = arg + src_shape = input_.diagonal(*arg_tuple).size() + src = make_arg(src_shape) + yield SampleInput(input_, args=(src, *arg_tuple)) def sample_inputs_to_sparse(op_info, device, dtype, requires_grad, **kwargs): @@ -6490,11 +6305,8 @@ def sample_inputs_polygamma(op_info, device, dtype, requires_grad, **kwargs): tensor_shapes = ((S, S), ()) ns = (1, 2, 3, 4, 5) - def generator(): - for shape, n in product(tensor_shapes, ns): - yield SampleInput(make_arg(shape), args=(n,)) - - return list(generator()) + for shape, n in product(tensor_shapes, ns): + yield SampleInput(make_arg(shape), args=(n,)) def sample_inputs_mvlgamma(op_info, device, dtype, requires_grad, **kwargs): @@ -6509,15 +6321,12 @@ def sample_inputs_mvlgamma(op_info, device, dtype, requires_grad, **kwargs): def compute_min_val(p): return (p - 1.) 
/ 2 - def generator(): - for shape, n in product(tensor_shapes, ns): - min_val = compute_min_val(n) - if not dtype.is_floating_point: - # Round-up minimum value for integral dtypes - min_val += 1 - yield SampleInput(make_arg(shape, low=min_val), args=(n,)) - - return list(generator()) + for shape, n in product(tensor_shapes, ns): + min_val = compute_min_val(n) + if not dtype.is_floating_point: + # Round-up minimum value for integral dtypes + min_val += 1 + yield SampleInput(make_arg(shape, low=min_val), args=(n,)) # Since `mvlgamma` has multiple entries, @@ -6763,12 +6572,9 @@ def sample_inputs_atan2(op_info, device, dtype, requires_grad, **kwargs): ((S, 1, S), (S, S), True), ) - def generator(): - for x_shape, y_shape, broadcasts_input in cases: - yield SampleInput(make_arg(x_shape), args=(make_arg(y_shape),), - broadcasts_input=broadcasts_input) - - return list(generator()) + for x_shape, y_shape, broadcasts_input in cases: + yield SampleInput(make_arg(x_shape), args=(make_arg(y_shape),), + broadcasts_input=broadcasts_input) def sample_inputs_split(op_info, device, dtype, requires_grad, *, list_args=False, **kwargs): @@ -6786,11 +6592,8 @@ def sample_inputs_split(op_info, device, dtype, requires_grad, *, list_args=Fals ((S, S, S), (S, 1)), ) - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=args) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=args) def sample_inputs_split_with_sizes(op_info, device, dtype, requires_grad, **kwargs): @@ -6802,11 +6605,8 @@ def sample_inputs_split_with_sizes(op_info, device, dtype, requires_grad, **kwar ((S, S, S), ([int(S / 3), S - int(S / 3) * 2, int(S / 3)], -2)), ) - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=args) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=args) def sample_inputs_msort(op_info, device, dtype, requires_grad): @@ -7003,30 
+6803,20 @@ def sample_inputs_tril_triu(op_info, device, dtype, requires_grad, **kwargs): ((S, M, M), (2,)), ((3, 3, S, S), ()),) - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=args) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=args) def sample_inputs_clone(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) - def generator(): - yield SampleInput(make_arg((S, M, S))) - yield SampleInput(make_arg(())) - - return list(generator()) + yield SampleInput(make_arg((S, M, S))) + yield SampleInput(make_arg(())) def sample_inputs_contiguous(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) - - def generator(): - yield SampleInput(make_arg((S, S))) - - return list(generator()) + yield SampleInput(make_arg((S, S))) def sample_inputs_sum_to_size(op_info, device, dtype, requires_grad, **kwargs): @@ -7057,21 +6847,18 @@ def sample_inputs_resize_ops(op_info, device, dtype, requires_grad, **kwargs): ((), (1, 1, 1)), ) - def generator(): - for shape, args_or_shape in cases: - # Update `args` based on operator - if op_info.name == 'resize_': - # resize_ takes shape/tuple of ints, - args = (args_or_shape, ) - elif op_info.name == 'resize_as_': - # resize_as_ takes another tensor - args = (make_arg(shape, requires_grad=False), ) # type:ignore[assignment] - else: - raise ValueError("sample_inputs_resize_ops is being used with incorrect operator") + for shape, args_or_shape in cases: + # Update `args` based on operator + if op_info.name == 'resize_': + # resize_ takes shape/tuple of ints, + args = (args_or_shape, ) + elif op_info.name == 'resize_as_': + # resize_as_ takes another tensor + args = (make_arg(shape, requires_grad=False), ) # type:ignore[assignment] + else: + raise ValueError("sample_inputs_resize_ops is 
being used with incorrect operator") - yield(SampleInput(make_arg(shape, requires_grad=requires_grad), args=args)) - - return list(generator()) + yield(SampleInput(make_arg(shape, requires_grad=requires_grad), args=args)) def sample_inputs_view_reshape(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) @@ -7084,18 +6871,15 @@ def sample_inputs_view_reshape(op_info, device, dtype, requires_grad, **kwargs): ((), ()), ((), (1,))) - def generator(): - for case in cases: - shape, args = case - inp = make_arg(shape, requires_grad=requires_grad) - yield(SampleInput(inp, args=(args, ))) + for case in cases: + shape, args = case + inp = make_arg(shape, requires_grad=requires_grad) + yield(SampleInput(inp, args=(args, ))) - if op_info.name != "view" and len(shape) >= 2: - yield(SampleInput( - inp.detach().clone().transpose(0, 1).requires_grad_(requires_grad), - args=(args, ))) - - return list(generator()) + if op_info.name != "view" and len(shape) >= 2: + yield(SampleInput( + inp.detach().clone().transpose(0, 1).requires_grad_(requires_grad), + args=(args, ))) def sample_inputs_view_as_reshape_as(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device) @@ -7105,18 +6889,15 @@ def sample_inputs_view_as_reshape_as(op_info, device, dtype, requires_grad, **kw ((), (1, 1)), ) - def generator(): - for case in cases: - shape, shape_other = case - inp = make_arg(shape, requires_grad=requires_grad) - yield(SampleInput(inp, args=(make_arg(shape_other, requires_grad=False),))) + for case in cases: + shape, shape_other = case + inp = make_arg(shape, requires_grad=requires_grad) + yield(SampleInput(inp, args=(make_arg(shape_other, requires_grad=False),))) - if op_info.name != "view_as" and len(shape) >= 2: - yield(SampleInput( - inp.detach().clone().transpose(0, 1).requires_grad_(requires_grad), - args=(make_arg(shape_other, requires_grad=False),))) 
- - return list(generator()) + if op_info.name != "view_as" and len(shape) >= 2: + yield(SampleInput( + inp.detach().clone().transpose(0, 1).requires_grad_(requires_grad), + args=(make_arg(shape_other, requires_grad=False),))) def sample_inputs_atleast1d2d3d(op_info, device, dtype, requires_grad, **kwargs): input_list = [] @@ -7161,11 +6942,8 @@ def sample_inputs_select(op_info, device, dtype, requires_grad, **kwargs): ((S,), (0, 2)) ) - def generator(): - for shape, args in cases: - yield SampleInput(make_arg(shape), args=args) - - return list(generator()) + for shape, args in cases: + yield SampleInput(make_arg(shape), args=args) def sample_inputs_select_scatter(op_info, device, dtype, requires_grad, **kwargs): @@ -7178,13 +6956,10 @@ def sample_inputs_select_scatter(op_info, device, dtype, requires_grad, **kwargs ((S,), (), (0, 2)) ) - def generator(): - for input_shape, src_shape, args in cases: - input_ = make_arg(input_shape) - src = make_arg(src_shape) - yield SampleInput(input_, args=(src, *args)) - - return list(generator()) + for input_shape, src_shape, args in cases: + input_ = make_arg(input_shape) + src = make_arg(src_shape) + yield SampleInput(input_, args=(src, *args)) def sample_inputs_slice_scatter(op_info, device, dtype, requires_grad, **kwargs): @@ -7201,13 +6976,10 @@ def sample_inputs_slice_scatter(op_info, device, dtype, requires_grad, **kwargs) ((L, L, L), (L, L, L // 4,), (2, L // 2, L, 2)), ) - def generator(): - for input_shape, src_shape, args in cases: - input_ = make_arg(input_shape) - src = make_arg(src_shape) - yield SampleInput(input_, args=(src, *args)) - - return list(generator()) + for input_shape, src_shape, args in cases: + input_ = make_arg(input_shape) + src = make_arg(src_shape) + yield SampleInput(input_, args=(src, *args)) def sample_inputs_rbinops(op_info, device, dtype, requires_grad, supports_dtype_kwargs=True, **kwargs): @@ -7243,12 +7015,9 @@ def sample_inputs_expand(op_info, device, dtype, requires_grad, **kwargs): 
((), (1, 3, 2)), ) - def generator(): - for case in cases: - shape, args = case - yield(SampleInput(make_arg(shape), args=(args, ))) - - return list(generator()) + for case in cases: + shape, args = case + yield(SampleInput(make_arg(shape), args=(args, ))) def sample_inputs_conversion(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) @@ -7257,11 +7026,9 @@ def sample_inputs_conversion(op_info, device, dtype, requires_grad, **kwargs): (2, 3)) memory_format_options = [None, torch.contiguous_format] - def generator(): - for shape, memory_format in itertools.product(shapes, memory_format_options): - yield SampleInput(make_arg(shape), - kwargs={'memory_format': memory_format} if memory_format else {}) - return list(generator()) + for shape, memory_format in itertools.product(shapes, memory_format_options): + yield SampleInput(make_arg(shape), + kwargs={'memory_format': memory_format} if memory_format else {}) def sample_inputs_conversion_channels_last(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) @@ -7279,12 +7046,9 @@ def sample_inputs_expand_as(op_info, device, dtype, requires_grad, **kwargs): ((), (1, 1)), ) - def generator(): - for shape, shape_other in cases: - yield(SampleInput(make_arg(shape, requires_grad=requires_grad), - args=(make_arg(shape_other, requires_grad=False), ))) - - return list(generator()) + for shape, shape_other in cases: + yield(SampleInput(make_arg(shape, requires_grad=requires_grad), + args=(make_arg(shape_other, requires_grad=False), ))) def sample_inputs_where(op_info, device, dtype, requires_grad, **kwargs): @@ -7316,13 +7080,10 @@ def sample_inputs_where(op_info, device, dtype, requires_grad, **kwargs): ((M, 1, M), (), (M, M, 1), True), ((), (M, M), (), True),) - def generator(): - for shape, mask_shape, other_shape, broadcasts_input in cases: - yield 
SampleInput(make_arg(shape), - args=(make_bool_mask(mask_shape), make_arg(other_shape)), - broadcasts_input=broadcasts_input) - - return list(generator()) + for shape, mask_shape, other_shape, broadcasts_input in cases: + yield SampleInput(make_arg(shape), + args=(make_bool_mask(mask_shape), make_arg(other_shape)), + broadcasts_input=broadcasts_input) def sample_inputs_nonzero(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad) @@ -7341,12 +7102,9 @@ def sample_inputs_nonzero(op_info, device, dtype, requires_grad, **kwargs): mixed[mask_t] = 0 inputs.append(mixed) - def generator(): - for input_t, as_tuple in product(inputs, [False, True]): - yield(SampleInput(input_t.detach().clone().requires_grad_(requires_grad), - kwargs=dict(as_tuple=as_tuple))) - - return list(generator()) + for input_t, as_tuple in product(inputs, [False, True]): + yield(SampleInput(input_t.detach().clone().requires_grad_(requires_grad), + kwargs=dict(as_tuple=as_tuple))) def sample_inputs_chunk(op_info, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, dtype=dtype, device=device) @@ -7355,12 +7113,9 @@ def sample_inputs_chunk(op_info, device, dtype, requires_grad, **kwargs): ((S, S, S), (S, 1)), ((S, S, S), (S, -1))) - def generator(): - for case in cases: - shape, args = case - yield(SampleInput(make_arg(shape, requires_grad=requires_grad), args=args)) - - return list(generator()) + for case in cases: + shape, args = case + yield(SampleInput(make_arg(shape, requires_grad=requires_grad), args=args)) def sample_inputs_kthvalue(op_info, device, dtype, requires_grad, **kwargs): def _tensor(shape, dtype=dtype, low=None, high=None): @@ -7561,66 +7316,63 @@ def sample_inputs_embedding(op_info, device, dtype, requires_grad, **kwargs): def make_long_input(shape, *, low, high): return make_tensor(shape, device=device, dtype=torch.long, low=low, high=high) - def generator(): - # 0-D index 
tensor - idx = make_long_input((), low=0, high=M) - yield SampleInput(make_input((M, S)), args=(idx,),) + # 0-D index tensor + idx = make_long_input((), low=0, high=M) + yield SampleInput(make_input((M, S)), args=(idx,),) - # 1-D index tensor - idx = make_long_input((S,), low=0, high=M) - yield SampleInput(make_input((M, S)), args=(idx,),) + # 1-D index tensor + idx = make_long_input((S,), low=0, high=M) + yield SampleInput(make_input((M, S)), args=(idx,),) - # 2-D index tensor - idx = make_long_input((S, S), low=0, high=M) - yield SampleInput(make_input((M, S)), args=(idx,),) + # 2-D index tensor + idx = make_long_input((S, S), low=0, high=M) + yield SampleInput(make_input((M, S)), args=(idx,),) - if not requires_grad: - # Following inputs return different gradient from the numerical gradient. - # This is expected and relevant tests are present in `test_nn.py`. + if not requires_grad: + # Following inputs return different gradient from the numerical gradient. + # This is expected and relevant tests are present in `test_nn.py`. - # The gradient vector at `padding_idx` is not updated. - idx = make_long_input((2, 2), low=0, high=S) - idx[0, 0] = 2 - idx[1, 1] = 2 - yield SampleInput(make_input((S, S)), args=(idx,), kwargs={'padding_idx': 2},) + # The gradient vector at `padding_idx` is not updated. + idx = make_long_input((2, 2), low=0, high=S) + idx[0, 0] = 2 + idx[1, 1] = 2 + yield SampleInput(make_input((S, S)), args=(idx,), kwargs={'padding_idx': 2},) - idx = make_long_input((2, 2), low=0, high=S) - idx[0, 0] = 4 - idx[1, 1] = 4 - yield SampleInput(make_input((S, S)), args=(idx,), kwargs={'padding_idx': -1},) + idx = make_long_input((2, 2), low=0, high=S) + idx[0, 0] = 4 + idx[1, 1] = 4 + yield SampleInput(make_input((S, S)), args=(idx,), kwargs={'padding_idx': -1},) - # Due to inplace renorming of weight, the numerical gradient doesn't match the - # analytical gradient. 
- idx = make_long_input((2, 2), low=0, high=S) - weights = make_input((S, S)) * 2 - yield SampleInput(weights, args=(idx,), kwargs={'max_norm': 1.},) + # Due to inplace renorming of weight, the numerical gradient doesn't match the + # analytical gradient. + idx = make_long_input((2, 2), low=0, high=S) + weights = make_input((S, S)) * 2 + yield SampleInput(weights, args=(idx,), kwargs={'max_norm': 1.},) - idx = make_long_input((2, 2), low=0, high=S) - weights = make_input((S, S)) * 2 - yield SampleInput(weights, args=(idx,), kwargs={'max_norm': 1., 'norm_type': 1.0},) + idx = make_long_input((2, 2), low=0, high=S) + weights = make_input((S, S)) * 2 + yield SampleInput(weights, args=(idx,), kwargs={'max_norm': 1., 'norm_type': 1.0},) - # Scale the gradient based on the inverse frequency of a particular index. - idx = make_long_input((2, 2), low=0, high=S) - idx[0, 0] = 1 - idx[0, 1] = 1 - weights = make_input((S, S)) - yield SampleInput(weights, args=(idx,), kwargs={'scale_grad_by_freq': True},) + # Scale the gradient based on the inverse frequency of a particular index. + idx = make_long_input((2, 2), low=0, high=S) + idx[0, 0] = 1 + idx[0, 1] = 1 + weights = make_input((S, S)) + yield SampleInput(weights, args=(idx,), kwargs={'scale_grad_by_freq': True},) - # gradcheck not implemented for sparse tensors. - idx = make_long_input((2, 2), low=0, high=S) - weights = make_input((S, S)) - yield SampleInput(weights, args=(idx,), kwargs={'sparse': True}) + # gradcheck not implemented for sparse tensors. 
+ idx = make_long_input((2, 2), low=0, high=S) + weights = make_input((S, S)) + yield SampleInput(weights, args=(idx,), kwargs={'sparse': True}) - idx = make_long_input((3, 3), low=0, high=S) - idx[0, 0] = 1 # freq more than 1 - idx[0, 1] = 1 # freq more than 1 - idx[1, 0] = 0 # padding_idx - weights = make_input((S, S)) * 2 - yield SampleInput(weights, args=(idx,), - kwargs={'sparse': True, 'scale_grad_by_freq': True, - 'padding_idx': 0, 'max_norm': 1.}) - - return list(generator()) + idx = make_long_input((3, 3), low=0, high=S) + idx[0, 0] = 1 # freq more than 1 + idx[0, 1] = 1 # freq more than 1 + idx[1, 0] = 0 # padding_idx + weights = make_input((S, S)) * 2 + yield SampleInput(weights, args=(idx,), + kwargs={'sparse': True, 'scale_grad_by_freq': True, + 'padding_idx': 0, 'max_norm': 1.}) def sample_inputs_one_hot(op_info, device, dtype, requires_grad, **kwargs): @@ -7677,13 +7429,10 @@ def sample_inputs_tensorsolve(op_info, device, dtype, requires_grad, **kwargs): # a_shapes += [(0, 0, 1, 2, 3, 0)] dimss = [None, (0, 2)] - def gen_inputs(): - for a_shape, dims in itertools.product(a_shapes, dimss): - a = make_tensor(a_shape, dtype=dtype, device=device, requires_grad=requires_grad) - b = make_tensor(a_shape[:2], dtype=dtype, device=device, requires_grad=requires_grad) - yield SampleInput(a, args=(b,), kwargs=dict(dims=dims)) - - return list(gen_inputs()) + for a_shape, dims in itertools.product(a_shapes, dimss): + a = make_tensor(a_shape, dtype=dtype, device=device, requires_grad=requires_grad) + b = make_tensor(a_shape[:2], dtype=dtype, device=device, requires_grad=requires_grad) + yield SampleInput(a, args=(b,), kwargs=dict(dims=dims)) def sample_inputs_mse_loss(op_info, device, dtype, requires_grad, **kwargs): _make_tensor = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -7742,17 +7491,14 @@ def sample_inputs_cosine_embedding_loss(op_info, device, dtype, requires_grad, * target = t.to(dtype=dtype).detach() return target - 
def gen_inputs(): - shapes = ((S, S), (S,)) - reductions = ('none', 'mean', 'sum') - for s, r in product(shapes, reductions): - yield SampleInput( - make_input(s), - args=(make_input(s), make_target(s)), - kwargs=dict(reduction=r, margin=random.uniform(-1, 1)) - ) - - return list(gen_inputs()) + shapes = ((S, S), (S,)) + reductions = ('none', 'mean', 'sum') + for s, r in product(shapes, reductions): + yield SampleInput( + make_input(s), + args=(make_input(s), make_target(s)), + kwargs=dict(reduction=r, margin=random.uniform(-1, 1)) + ) def sample_inputs_ctc_loss(op_info, device, dtype, requires_grad, **kwargs): input_length = 50 @@ -7765,18 +7511,15 @@ def sample_inputs_ctc_loss(op_info, device, dtype, requires_grad, **kwargs): log_probs = t.log_softmax(2).to(device=device, dtype=dtype).detach().requires_grad_(requires_grad=requires_grad) return log_probs - def gen_inputs(): - reductions = ('none', 'mean', 'sum') - zero_inf = (True, False) - for r, z in product(reductions, zero_inf): - log_probs = make_log_probs((input_length, batch, num_char)) - targets = torch.randint(1, num_char, (batch, target_length), dtype=torch.long, device=device) - input_lengths = torch.full((batch, ), input_length, dtype=torch.long, device=device) - target_lengths = torch.randint(10, target_length, (batch, ), dtype=torch.long, device=device) + reductions = ('none', 'mean', 'sum') + zero_inf = (True, False) + for r, z in product(reductions, zero_inf): + log_probs = make_log_probs((input_length, batch, num_char)) + targets = torch.randint(1, num_char, (batch, target_length), dtype=torch.long, device=device) + input_lengths = torch.full((batch, ), input_length, dtype=torch.long, device=device) + target_lengths = torch.randint(10, target_length, (batch, ), dtype=torch.long, device=device) - yield SampleInput(log_probs, args=(targets, input_lengths, target_lengths,), kwargs=dict(reduction=r, zero_infinity=z)) - - return list(gen_inputs()) + yield SampleInput(log_probs, args=(targets, 
input_lengths, target_lengths,), kwargs=dict(reduction=r, zero_infinity=z)) def sample_inputs_nll_loss(op_info, device, dtype, requires_grad, **kwargs): shape = (2, 3) @@ -7816,38 +7559,32 @@ def sample_inputs_nll_loss(op_info, device, dtype, requires_grad, **kwargs): if reduction != "mean": yield make_input(s), make_target(s, zeros=True), dict(ignore_index=0, reduction=reduction) - def gen_inputs(): - for input, target, kwargs in gen_shape_kwargs(): - yield SampleInput(input, args=(target,), kwargs=kwargs) - - return list(gen_inputs()) + for input, target, kwargs in gen_shape_kwargs(): + yield SampleInput(input, args=(target,), kwargs=kwargs) def sample_inputs_argwhere(op_info, device, dtype, requires_grad, **kwargs): - def generator(): - yield SampleInput(torch.tensor([1, 0, 2, 0], dtype=dtype, device=device, requires_grad=requires_grad)) + yield SampleInput(torch.tensor([1, 0, 2, 0], dtype=dtype, device=device, requires_grad=requires_grad)) - mask = torch.tensor([[0, 1, 0, 1, 0], - [1, 1, 1, 1, 0], - [0, 0, 0, 1, 0], - [1, 0, 1, 1, 0], - [1, 0, 0, 1, 0]], dtype=torch.bool, device=device) - t = make_tensor((S, S), dtype=dtype, device=device, requires_grad=requires_grad) - with torch.no_grad(): - t[mask] = 0 - yield SampleInput(t) + mask = torch.tensor([[0, 1, 0, 1, 0], + [1, 1, 1, 1, 0], + [0, 0, 0, 1, 0], + [1, 0, 1, 1, 0], + [1, 0, 0, 1, 0]], dtype=torch.bool, device=device) + t = make_tensor((S, S), dtype=dtype, device=device, requires_grad=requires_grad) + with torch.no_grad(): + t[mask] = 0 + yield SampleInput(t) - t = make_tensor((S, S), dtype=dtype, device=device, requires_grad=requires_grad, noncontiguous=True) - with torch.no_grad(): - t[mask] = 0 - yield SampleInput(t) + t = make_tensor((S, S), dtype=dtype, device=device, requires_grad=requires_grad, noncontiguous=True) + with torch.no_grad(): + t[mask] = 0 + yield SampleInput(t) - t = make_tensor((S, 0), dtype=dtype, device=device, requires_grad=requires_grad) - yield SampleInput(t) + t = 
make_tensor((S, 0), dtype=dtype, device=device, requires_grad=requires_grad) + yield SampleInput(t) - yield SampleInput(torch.zeros((S,), dtype=dtype, device=device, requires_grad=requires_grad)) - yield SampleInput(make_tensor((), dtype=dtype, device=device, requires_grad=requires_grad)) - - return list(generator()) + yield SampleInput(torch.zeros((S,), dtype=dtype, device=device, requires_grad=requires_grad)) + yield SampleInput(make_tensor((), dtype=dtype, device=device, requires_grad=requires_grad)) def _generate_sample_shape_reduction(): shapes = ((S,), (S, S), (S, S, S)) @@ -7857,7 +7594,8 @@ def _generate_sample_shape_reduction(): def sample_inputs_gaussian_nll_loss(op_info, device, dtype, requires_grad, **kwargs): _make_tensor = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) - make_var = partial(make_tensor, low=0, device=device, dtype=dtype, requires_grad=requires_grad) + # Set low slightly above 0 so gradcheck doesn't accidentally dip below 0 + make_var = partial(make_tensor, low=0.1, device=device, dtype=dtype, requires_grad=requires_grad) def gen_shape(shape): yield shape @@ -7882,11 +7620,8 @@ def sample_inputs_gaussian_nll_loss(op_info, device, dtype, requires_grad, **kwa dict(full=True, eps=random.uniform(1e-6, 1e-3), reduction=r) ) - def gen_inputs(): - for input, target, var, kwargs in gen_shape_kwargs(): - yield SampleInput(input, args=(target, var, ), kwargs=kwargs) - - return list(gen_inputs()) + for input, target, var, kwargs in gen_shape_kwargs(): + yield SampleInput(input, args=(target, var, ), kwargs=kwargs) def _generate_sample_inputs_nn_loss(op_info, device, dtype, requires_grad, **kwargs): _make_tensor = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -7895,20 +7630,14 @@ def _generate_sample_inputs_nn_loss(op_info, device, dtype, requires_grad, **kwa yield _make_tensor(s), _make_tensor(s), dict(reduction=r) def sample_inputs_hinge_embedding_loss(op_info, device, dtype, 
requires_grad, **kwargs): - def gen_inputs(): - for input, target, d in _generate_sample_inputs_nn_loss(op_info, device, dtype, requires_grad, **kwargs): - d['margin'] = random.uniform(-9, 9) - yield SampleInput(input, args=(target, ), kwargs=d) - - return list(gen_inputs()) + for input, target, d in _generate_sample_inputs_nn_loss(op_info, device, dtype, requires_grad, **kwargs): + d['margin'] = random.uniform(-9, 9) + yield SampleInput(input, args=(target, ), kwargs=d) def sample_inputs_huber_loss(op_info, device, dtype, requires_grad, **kwargs): - def gen_inputs(): - for input, target, d in _generate_sample_inputs_nn_loss(op_info, device, dtype, requires_grad, **kwargs): - d['delta'] = random.uniform(1e-3, 9) - yield SampleInput(input, args=(target, ), kwargs=d) - - return list(gen_inputs()) + for input, target, d in _generate_sample_inputs_nn_loss(op_info, device, dtype, requires_grad, **kwargs): + d['delta'] = random.uniform(1e-3, 9) + yield SampleInput(input, args=(target, ), kwargs=d) def sample_inputs_poisson_nll_loss(op_info, device, dtype, requires_grad, **kwargs): _make_tensor = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -7928,11 +7657,8 @@ def sample_inputs_poisson_nll_loss(op_info, device, dtype, requires_grad, **kwar reduction=r) ) - def gen_inputs(): - for input, target, kwargs in gen_shape_kwargs(): - yield SampleInput(input, args=(target, ), kwargs=kwargs) - - return list(gen_inputs()) + for input, target, kwargs in gen_shape_kwargs(): + yield SampleInput(input, args=(target, ), kwargs=kwargs) def sample_inputs_pairwise_distance(op_info, device, dtype, requires_grad, **kwargs): make = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -9035,13 +8761,7 @@ op_db: List[OpInfo] = [ supports_out=False, supports_gradgrad=False, assert_autodiffed=False, - sample_inputs_func=sample_inputs_cdist, - skips=( - # RuntimeError: _cdist_backward requires X1 to be contiguous - 
DecorateInfo(unittest.skip("_cdist_backward requires X1 to be contiguous"), - 'TestCommon', 'test_noncontiguous_samples'), - ) - ), + sample_inputs_func=sample_inputs_cdist), UnaryUfuncInfo('ceil', ref=np.ceil, dtypes=floating_types_and(torch.bfloat16), @@ -9860,6 +9580,9 @@ op_db: List[OpInfo] = [ skips=( # Dispatch stub: unsupported device typemeta DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad', device_type='meta'), + # (ROCm) Memory access fault by GPU node-4 (Agent handle: 0x55cebc9e8430) on address 0x7fa17b757000 + DecorateInfo(unittest.skip("Skipped! ROCm memory exception"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM), )), BinaryUfuncInfo('floor_divide', dtypes=all_types_and(torch.half, torch.bfloat16), @@ -9939,6 +9662,9 @@ op_db: List[OpInfo] = [ DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.float32, torch.complex64)), # noqa: B950 # 69925: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad', device_type='cuda'), + # (ROCm) Memory exception on virtual address 0x7f6f3deb7000, node id 4: Page not present + DecorateInfo(unittest.skip("Skipped! ROCm memory exception"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64, torch.complex128], active_if=TEST_WITH_ROCM), ), supports_inplace_autograd=False, sample_inputs_func=sample_inputs_gradient), @@ -10519,6 +10245,9 @@ op_db: List[OpInfo] = [ skips=( # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...) 
DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'), + # (ROCm) unexpected success + DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM), )), OpInfo('median', dtypes=all_types_and(torch.bfloat16), @@ -10531,6 +10260,9 @@ op_db: List[OpInfo] = [ skips=( # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...) DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'), + # (ROCm) unexpected success + DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM), )), OpInfo('nanmedian', dtypes=all_types_and(torch.bfloat16), @@ -10543,6 +10275,9 @@ op_db: List[OpInfo] = [ skips=( # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...) DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'), + # (ROCm) unexpected success + DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM), )), OpInfo('var_mean', dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16), @@ -10647,6 +10382,9 @@ op_db: List[OpInfo] = [ skips=( # 69855: RuntimeError: ZeroTensors are immutable. Please use the materialized zero tensor (...) 
DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'), + # (ROCm) unexpected success + DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM), )), OpInfo('quantile', dtypes=floating_types(), @@ -12886,6 +12624,9 @@ op_db: List[OpInfo] = [ skips=( # test does not work with passing lambda for op DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit'), + # CUDA runs out of memory + DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.cdouble]), )), OpInfo('linalg.pinv', aten_name='linalg_pinv', @@ -14075,6 +13816,9 @@ op_db: List[OpInfo] = [ skips=( # Dispatch stub: unsupported device typemeta DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad', device_type='meta'), + # (ROCm) Memory access fault by GPU node-4 (Agent handle: 0x55860348e690) on address 0x7f0f4ddcb000 + DecorateInfo(unittest.skip("Skipped! ROCm memory exception"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64, torch.complex128], active_if=TEST_WITH_ROCM), )), OpInfo('trapezoid', dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), @@ -14085,6 +13829,9 @@ op_db: List[OpInfo] = [ skips=( # Dispatch stub: unsupported device typemeta DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad', device_type='meta'), + # (ROCm) Memory access fault by GPU node-4 (Agent handle: 0x55bbf53d5500) on address 0x7fe536eb5000 + DecorateInfo(unittest.skip("Skipped! 
ROCm memory exception"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64, torch.complex128], active_if=TEST_WITH_ROCM), )), OpInfo('cumulative_trapezoid', dtypes=all_types_and_complex_and(), @@ -14593,7 +14340,10 @@ op_db: List[OpInfo] = [ DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_neg_view', device_type='cuda'), # On CUDA, the op is dispatched (and a few more conditions) to # _fused_dropout, which doesn't support forward AD - DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_forward_mode_AD', device_type='cuda'),), + DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_forward_mode_AD', device_type='cuda'), + # (ROCm) NotImplementedError: Trying to use forward AD with native_dropout that does not support it + DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad', + device_type='cuda', dtypes=[torch.float64], active_if=TEST_WITH_ROCM),), gradcheck_wrapper=wrapper_set_seed, supports_forward_ad=True, supports_fwgrad_bwgrad=True, diff --git a/torch/utils/data/datapipes/iter/callable.py b/torch/utils/data/datapipes/iter/callable.py index e8a96d9fa847..bbe28172222d 100644 --- a/torch/utils/data/datapipes/iter/callable.py +++ b/torch/utils/data/datapipes/iter/callable.py @@ -1,6 +1,6 @@ import warnings from torch.utils.data import IterDataPipe, _utils, functional_datapipe -from typing import Callable, Dict, Iterator, Optional, Sized, Tuple, TypeVar +from typing import Callable, Iterator, Sized, TypeVar try: import dill @@ -37,8 +37,6 @@ class MapperIterDataPipe(IterDataPipe[T_co]): For `input_col` with multiple indices, the left-most one is used, and other indices will be removed. - Integer is used for list/tuple. -1 represents to append result at the end. - Key is used for dict. New key is acceptable. 
- fn_args: Positional arguments for `fn` - fn_kwargs: Keyword arguments for `fn` """ datapipe: IterDataPipe fn: Callable @@ -49,9 +47,6 @@ class MapperIterDataPipe(IterDataPipe[T_co]): fn: Callable, input_col=None, output_col=None, - *, - fn_args: Optional[Tuple] = None, - fn_kwargs: Optional[Dict] = None, ) -> None: super().__init__() self.datapipe = datapipe @@ -70,20 +65,18 @@ class MapperIterDataPipe(IterDataPipe[T_co]): raise ValueError("`output_col` must be a single-element list or tuple") output_col = output_col[0] self.output_col = output_col - self.args = () if fn_args is None else fn_args - self.kwargs = {} if fn_kwargs is None else fn_kwargs def _apply_fn(self, data): if self.input_col is None and self.output_col is None: - return self.fn(data, *self.args, **self.kwargs) + return self.fn(data) if self.input_col is None: - res = self.fn(data, *self.args, **self.kwargs) + res = self.fn(data) elif isinstance(self.input_col, (list, tuple)): args = tuple(data[col] for col in self.input_col) - res = self.fn(*args, *self.args, **self.kwargs) + res = self.fn(*args) else: - res = self.fn(data[self.input_col], *self.args, **self.kwargs) + res = self.fn(data[self.input_col]) # Copy tuple to list and run in-place modification because tuple is immutable. if isinstance(data, tuple): @@ -132,8 +125,6 @@ class MapperIterDataPipe(IterDataPipe[T_co]): dill_function, self.input_col, self.output_col, - self.args, - self.kwargs, ) return state @@ -143,8 +134,6 @@ class MapperIterDataPipe(IterDataPipe[T_co]): dill_function, self.input_col, self.output_col, - self.args, - self.kwargs, ) = state if DILL_AVAILABLE: self.fn = dill.loads(dill_function) # type: ignore[assignment] @@ -163,8 +152,6 @@ class CollatorIterDataPipe(MapperIterDataPipe): datapipe: Iterable DataPipe being collated collate_fn: Customized collate function to collect and combine data or a batch of data. Default function collates to Tensor(s) based on data type. 
- fn_args: Positional arguments for `collate_fn` - fn_kwargs: Keyword arguments for `collate_fn` Example: Convert integer data to float Tensor >>> class MyIterDataPipe(torch.utils.data.IterDataPipe): @@ -196,7 +183,5 @@ class CollatorIterDataPipe(MapperIterDataPipe): self, datapipe: IterDataPipe, collate_fn: Callable = _utils.collate.default_collate, - fn_args: Optional[Tuple] = None, - fn_kwargs: Optional[Dict] = None, ) -> None: - super().__init__(datapipe, fn=collate_fn, fn_args=fn_args, fn_kwargs=fn_kwargs) + super().__init__(datapipe, fn=collate_fn) diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py index 2edb0017b042..bd31d938c1b2 100644 --- a/torch/utils/data/datapipes/iter/selecting.py +++ b/torch/utils/data/datapipes/iter/selecting.py @@ -1,5 +1,5 @@ import warnings -from typing import Callable, Dict, Iterator, Optional, Tuple, TypeVar +from typing import Callable, Iterator, TypeVar from torch.utils.data import IterDataPipe, functional_datapipe from torch.utils.data.datapipes.dataframe import dataframe_wrapper as df_wrapper @@ -28,8 +28,6 @@ class FilterIterDataPipe(IterDataPipe[T_co]): Args: datapipe: Iterable DataPipe being filtered filter_fn: Customized function mapping an element to a boolean. 
- fn_args: Positional arguments for `filter_fn` - fn_kwargs: Keyword arguments for `filter_fn` drop_empty_batches: By default, drops batch if it is empty after filtering instead of keeping an empty list """ datapipe: IterDataPipe @@ -39,8 +37,6 @@ class FilterIterDataPipe(IterDataPipe[T_co]): def __init__(self, datapipe: IterDataPipe, filter_fn: Callable, - fn_args: Optional[Tuple] = None, - fn_kwargs: Optional[Dict] = None, drop_empty_batches: bool = True, ) -> None: super().__init__() @@ -50,8 +46,6 @@ class FilterIterDataPipe(IterDataPipe[T_co]): warnings.warn("Lambda function is not supported for pickle, please use " "regular python function or functools.partial instead.") self.filter_fn = filter_fn # type: ignore[assignment] - self.args = () if fn_args is None else fn_args - self.kwargs = {} if fn_kwargs is None else fn_kwargs self.drop_empty_batches = drop_empty_batches def __iter__(self) -> Iterator[T_co]: @@ -62,7 +56,7 @@ class FilterIterDataPipe(IterDataPipe[T_co]): yield filtered def _returnIfTrue(self, data): - condition = self.filter_fn(data, *self.args, **self.kwargs) + condition = self.filter_fn(data) if df_wrapper.is_column(condition): # We are operating on DataFrames filter here @@ -95,11 +89,11 @@ class FilterIterDataPipe(IterDataPipe[T_co]): dill_function = dill.dumps(self.filter_fn) else: dill_function = self.filter_fn - state = (self.datapipe, dill_function, self.args, self.kwargs, self.drop_empty_batches) + state = (self.datapipe, dill_function, self.drop_empty_batches) return state def __setstate__(self, state): - (self.datapipe, dill_function, self.args, self.kwargs, self.drop_empty_batches) = state + (self.datapipe, dill_function, self.drop_empty_batches) = state if DILL_AVAILABLE: self.filter_fn = dill.loads(dill_function) # type: ignore[assignment] else: diff --git a/torch/utils/data/datapipes/map/callable.py b/torch/utils/data/datapipes/map/callable.py index 8dbad957e069..a7527d24e248 100644 --- 
a/torch/utils/data/datapipes/map/callable.py +++ b/torch/utils/data/datapipes/map/callable.py @@ -1,5 +1,5 @@ import warnings -from typing import Callable, Dict, Optional, Tuple, TypeVar +from typing import Callable, TypeVar from torch.utils.data import MapDataPipe, functional_datapipe @@ -35,8 +35,6 @@ class MapperMapDataPipe(MapDataPipe[T_co]): args: datapipe: Source Map DataPipe fn: Function called over each item - fn_args: Positional arguments for `fn` - fn_kwargs: Keyword arguments for `fn` """ datapipe: MapDataPipe fn: Callable @@ -45,8 +43,6 @@ class MapperMapDataPipe(MapDataPipe[T_co]): self, datapipe: MapDataPipe, fn: Callable = default_fn, - fn_args: Optional[Tuple] = None, - fn_kwargs: Optional[Dict] = None, ) -> None: super().__init__() self.datapipe = datapipe @@ -57,25 +53,23 @@ class MapperMapDataPipe(MapDataPipe[T_co]): "regular python function or functools.partial instead." ) self.fn = fn # type: ignore[assignment] - self.args = () if fn_args is None else fn_args - self.kwargs = {} if fn_kwargs is None else fn_kwargs def __len__(self) -> int: return len(self.datapipe) def __getitem__(self, index) -> T_co: - return self.fn(self.datapipe[index], *self.args, **self.kwargs) + return self.fn(self.datapipe[index]) def __getstate__(self): if DILL_AVAILABLE: dill_function = dill.dumps(self.fn) else: dill_function = self.fn - state = (self.datapipe, dill_function, self.args, self.kwargs) + state = (self.datapipe, dill_function) return state def __setstate__(self, state): - (self.datapipe, dill_function, self.args, self.kwargs) = state + (self.datapipe, dill_function) = state if DILL_AVAILABLE: self.fn = dill.loads(dill_function) # type: ignore[assignment] else: diff --git a/torch/utils/data/dataset.pyi b/torch/utils/data/dataset.pyi index f2ac8102ea2b..a84a1bee364d 100644 --- a/torch/utils/data/dataset.pyi +++ b/torch/utils/data/dataset.pyi @@ -44,7 +44,7 @@ class MapDataPipe(Generic[T_co]): # Functional form of 'ConcaterMapDataPipe' def concat(self, 
*datapipes: MapDataPipe) -> MapDataPipe: ... # Functional form of 'MapperMapDataPipe' - def map(self, fn: Callable= ..., fn_args: Optional[Tuple] = None, fn_kwargs: Optional[Dict] = None) -> MapDataPipe: ... + def map(self, fn: Callable= ...) -> MapDataPipe: ... # Functional form of 'ShufflerMapDataPipe' def shuffle(self, *, indices: Optional[List] = None) -> MapDataPipe: ... # Functional form of 'ZipperMapDataPipe' @@ -65,7 +65,7 @@ class IterableDataset(Dataset[T_co], metaclass=_DataPipeMeta): # Functional form of 'BatcherIterDataPipe' def batch(self, batch_size: int, drop_last: bool = False, wrapper_class=DataChunk) -> IterDataPipe: ... # Functional form of 'CollatorIterDataPipe' - def collate(self, collate_fn: Callable= ..., fn_args: Optional[Tuple] = None, fn_kwargs: Optional[Dict] = None) -> IterDataPipe: ... + def collate(self, collate_fn: Callable= ...) -> IterDataPipe: ... # Functional form of 'ConcaterIterDataPipe' def concat(self, *datapipes: IterDataPipe) -> IterDataPipe: ... # Functional form of 'RoutedDecoderIterDataPipe' @@ -73,13 +73,13 @@ class IterableDataset(Dataset[T_co], metaclass=_DataPipeMeta): # Functional form of 'DemultiplexerIterDataPipe' def demux(self, num_instances: int, classifier_fn: Callable[[T_co], Optional[int]], drop_none: bool = False, buffer_size: int = 1000) -> List[IterDataPipe]: ... # Functional form of 'FilterIterDataPipe' - def filter(self, filter_fn: Callable, fn_args: Optional[Tuple] = None, fn_kwargs: Optional[Dict] = None, drop_empty_batches: bool = True) -> IterDataPipe: ... + def filter(self, filter_fn: Callable, drop_empty_batches: bool = True) -> IterDataPipe: ... # Functional form of 'ForkerIterDataPipe' def fork(self, num_instances: int, buffer_size: int = 1000) -> List[IterDataPipe]: ... 
# Functional form of 'GrouperIterDataPipe' def groupby(self, group_key_fn: Callable, *, buffer_size: int = 10000, group_size: Optional[int] = None, unbatch_level: int = 0, guaranteed_group_size: Optional[int] = None, drop_remaining: bool = False) -> IterDataPipe: ... # Functional form of 'MapperIterDataPipe' - def map(self, fn: Callable, input_col=None, output_col=None, *, fn_args: Optional[Tuple] = None, fn_kwargs: Optional[Dict] = None) -> IterDataPipe: ... + def map(self, fn: Callable, input_col=None, output_col=None) -> IterDataPipe: ... # Functional form of 'MultiplexerIterDataPipe' def mux(self, *datapipes) -> IterDataPipe: ... # Functional form of 'ShardingFilterIterDataPipe'