Compare commits

...

255 Commits

Author SHA1 Message Date
209a9bd3ca Update base for Update on "[NJT] Actually inline NT torch function during dynamo"
To avoid having any temporary state where behavior is regressed, this PR does all of the following at once:

(1) Disables torch function running a second time in AOTAutograd

If you have a tensor subclass that relies on dispatching into the same op without unwrapping, by calling torch._C.DisableTorchFunctionSubclass(), the torch-function-ness will survive into AOTAutograd (when normally we would expect the torch function to be inlined away during dynamo). If this happens, we should make sure not to run the torch function logic a second time (see the sketch after this list).

(2) Enables torch function to be inlined in dynamo for NT

Due to torch function running a second time in AOTAutograd, NT was actually relying on this behavior instead of properly inlining through torch function at the dynamo level.

(3) Fixes graph breaks for NT torch function

Now that we are inlining through torch function for the first time in dynamo, we've uncovered some graph breaks. Thanks to mlazos, we should now have support for custom attributes on torch function subclasses. We also add support for a custom Enum type. Finally, a few graph breaks can be eliminated by adding allow_in_graph (though we may need to double-check the soundness here).
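
A minimal sketch (not the NT implementation) of the pattern described in (1): a subclass whose `__torch_function__` re-dispatches into the same op without unwrapping, using `torch._C.DisableTorchFunctionSubclass()` to avoid infinite recursion. The torch-function-ness of such a subclass is what can survive into AOTAutograd.
```python
import torch

class PassthroughTensor(torch.Tensor):
    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        # Re-dispatch into the same op without unwrapping the subclass.
        with torch._C.DisableTorchFunctionSubclass():
            return func(*args, **kwargs)

x = torch.ones(3).as_subclass(PassthroughTensor)
y = x + 1  # goes through __torch_function__, then dispatches as usual
```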


Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-04-01 17:02:30 -07:00
23df075e2e Update base for Update on "[NJT] Actually inline NT torch function during dynamo"
To avoid having any temporary state where behavior is regressed, this PR does all of the following at once:

(1) Disables torch function running a second time in AOTAutograd

If you have a tensor subclass that relies on dispatching into the same op without unwrapping, by calling torch._C.DisableTorchFunctionSubclass(), the torch-function-ness will survive into AOTAutograd (when normally we would expect the torch function to be inlined away during dynamo). If this happens, we should make sure not to run the torch function logic a second time.

(2) Enables torch function to be inlined in dynamo for NT

Due to torch function running a second time in AOTAutograd, NT was actually relying on this behavior instead of properly inlining through torch function at the dynamo level.

(3) Fixes graph breaks for NT torch function

Now that we are inlining through torch function for the first time in dynamo, we've uncovered some graph breaks. Thanks to mlazos, we should now have support for custom attributes on torch function subclasses. We also add support for a custom Enum type. Finally, a few graph breaks can be eliminated by adding allow_in_graph (though we may need to double-check the soundness here).


Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-04-01 11:22:39 -07:00
76a87e33a0 Remove cuda dependencies when building AOTriton (#122982)
Downloading CUDA sometimes fails and breaks the build process, but AOTriton does not need these packages for its own Triton fork. This commit comments out the related downloading scripts.

The actual changes from Triton can be found at: 9b73a543a5

Fixes the following build error:
```
[2/6] cd /var/lib/jenkins/workspace/build/aotriton/src/third_party/triton/python && /opt/conda/envs/py_3.8/bin/cmake -E env VIRTUAL_ENV=/var/lib/jenkins/workspace/build/aotriton/build/venv PATH="/var/lib/jenkins/workspace/build/aotriton/build/venv/bin:/opt/cache/bin:/opt/rocm/llvm/bin:/opt/rocm/opencl/bin:/opt/rocm/hip/bin:/opt/rocm/hcc/bin:/opt/rocm/bin:/opt/conda/envs/py_3.8/bin:/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" TRITON_BUILD_DIR=/var/lib/jenkins/workspace/build/aotriton/build/triton_build python setup.py develop
FAILED: CMakeFiles/aotriton_venv_triton /var/lib/jenkins/.local/lib/python3.8/site-packages/triton/_C/libtriton.so /var/lib/jenkins/workspace/build/aotriton/build/CMakeFiles/aotriton_venv_triton
cd /var/lib/jenkins/workspace/build/aotriton/src/third_party/triton/python && /opt/conda/envs/py_3.8/bin/cmake -E env VIRTUAL_ENV=/var/lib/jenkins/workspace/build/aotriton/build/venv PATH="/var/lib/jenkins/workspace/build/aotriton/build/venv/bin:/opt/cache/bin:/opt/rocm/llvm/bin:/opt/rocm/opencl/bin:/opt/rocm/hip/bin:/opt/rocm/hcc/bin:/opt/rocm/bin:/opt/conda/envs/py_3.8/bin:/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" TRITON_BUILD_DIR=/var/lib/jenkins/workspace/build/aotriton/build/triton_build python setup.py develop
downloading and extracting https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-64/cuda-nvcc-12.1.105-0.tar.bz2 ...
downloading and extracting https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-64/cuda-cuobjdump-12.1.111-0.tar.bz2 ...
Traceback (most recent call last):
  File "/var/lib/jenkins/workspace/build/aotriton/src/third_party/triton/python/setup.py", line 325, in <module>
    download_and_copy(
  File "/var/lib/jenkins/workspace/build/aotriton/src/third_party/triton/python/setup.py", line 151, in download_and_copy
    ftpstream = urllib.request.urlopen(url)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/urllib/request.py", line 215, in urlopen
    return opener.open(url, data, timeout)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/urllib/request.py", line 521, in open
    response = meth(req, response)
               ^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/urllib/request.py", line 630, in http_response
    response = self.parent.error(
               ^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/urllib/request.py", line 559, in error
    return self._call_chain(*args)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/urllib/request.py", line 492, in _call_chain
    result = func(*args)
             ^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/urllib/request.py", line 639, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 524:
ninja: build stopped: subcommand failed.
```

Example of failed build log: https://github.com/pytorch/pytorch/actions/runs/8483953034/job/23245996425
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122982
Approved by: https://github.com/jansel
2024-04-01 17:50:35 +00:00
c422bce131 [codemod] Fix some namespace issues in caffe2 (#121847)
Summary:
Removes `using namespace` from a header file. Having `using namespace` in a header file is *always* a bad idea. A previous raft of diffs provided appropriate qualifications to everything that relied on this `using namespace`, so it is now safe to remove it in this separate diff.

Helps us enable `-Wheader-hygiene`.

Test Plan: Sandcastle

Reviewed By: dmm-fb

Differential Revision: D54838298

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121847
Approved by: https://github.com/Skylion007
2024-04-01 17:45:16 +00:00
533c1b6c49 Disable vulkan logsoftmax test (#123103)
Example: https://github.com/pytorch/pytorch/actions/runs/8509797936/job/23306567177

The failure only surfaced after #122845 (the bug fix to surface cpp test failures), so I don't know when it started.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/123103
Approved by: https://github.com/kit1980
2024-04-01 17:41:59 +00:00
d7a274e1b0 [dtensor] switch aten.t to use op strategy (#122950)
as titled

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122950
Approved by: https://github.com/awgu, https://github.com/tianyu-l
ghstack dependencies: #122929, #122949
2024-04-01 17:39:43 +00:00
9e1447dad6 [dtensor] make sure expected input spec have correct tensor meta (#122949)
As titled. Previously we could return an expected input spec that is shared by multiple args. This is not OK since different args might have different tensor metas; the reason it worked before is that redistribute in these cases becomes a no-op.

This PR fixes it by making each expected input spec shallow-clone the corresponding input metadata.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122949
Approved by: https://github.com/tianyu-l
ghstack dependencies: #122929
2024-04-01 17:39:42 +00:00
afee5bea92 [dtensor] refactor schema suggestions in output sharding (#122929)
This PR refactors schema_suggestions in OutputSharding to be a single OpSchema instead of a list of schemas, since in practice we only ever have one. The multiple-resharding case has also moved to OpStrategy, so there is no case left that needs a list.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122929
Approved by: https://github.com/tianyu-l
2024-04-01 17:39:39 +00:00
b4c810491e [export] Temporarily block mutating ops in quant tests. (#122863)
Summary: After we migrate to torch.export, we won't see ops like add_ and mul_ due to functionalization. We are rolling out pre dispatch export, so for now we just skip those mutating ops in tests.

Test Plan: buck run mode/opt caffe2/test/quantization:test_quantization

Reviewed By: tugsbayasgalan

Differential Revision: D55442019

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122863
Approved by: https://github.com/clee2000
2024-04-01 16:41:13 +00:00
526ca5f28e [vec] fix compile warning in vec_n.h (#123090)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123090
Approved by: https://github.com/lezcano
2024-04-01 15:55:27 +00:00
9ff2a9dcdd [dynamo] Skip leaf check on assert_metadata_eq if grad tensor level is -2 (#122728)
When fakifying a grad-tracking tensor, if the level is -2 (a sentinel value) we can just unwrap the grad tensor and return a fake version of it. In this PR, we update `assert_metadata_eq` to not compare whether the grad tensor and the unwrapped one are leaves, as this may not always be true.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122728
Approved by: https://github.com/zou3519
2024-04-01 15:38:16 +00:00
03439d4c1c [inductor] Lower divide by constant as multiplication by reciprocal (#121924)
Fixes #101039

This lowers division by a constant value to multiplication by the reciprocal.
The same optimization is applied in eager mode on CUDA:

0636c11811/aten/src/ATen/native/cuda/BinaryDivTrueKernel.cu (L36-L38)
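
A rough illustration of the equivalence being exploited (not the inductor lowering itself): dividing by a compile-time constant can be rewritten as multiplying by its reciprocal.
```python
import torch

x = torch.randn(8)
c = 3.0
# The rewritten form matches the division up to floating-point rounding.
assert torch.allclose(x / c, x * (1.0 / c))
```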

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121924
Approved by: https://github.com/lezcano
2024-04-01 14:37:37 +00:00
6939279a17 [dynamo] Forward OptimizedModule.__setattr__ to the wrapped module (#122098)
Fixes #114844

In the linked issue we have
```
compiled_module = torch.compile(module)
compiled_module.x = ...
compiled_module(...)  # Mutates self.x
```
Since the module mutates `self.x`, you would expect `compiled_module.x`
to be updated, but actually `compiled_module.x = ...` sets an attribute "x"
on the `OptimizedModule` object, while the forward method of the module mutates
`module.x`.

This gives the expected behavior by forwarding `compiled_module.__setattr__`
down to `module.__setattr__`. There is already a corresponding `__getattr__`
so now `compiled_module.x` becomes an alias for `module.x`.
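
A hedged sketch of the expected behavior after this change (a minimal module standing in for the one in the issue):
```python
import torch
import torch.nn as nn

class Counter(nn.Module):
    def __init__(self):
        super().__init__()
        self.x = 0

    def forward(self, inp):
        self.x += 1  # mutates module state
        return inp + self.x

module = Counter()
compiled_module = torch.compile(module)
compiled_module.x = 10           # now forwarded to module.__setattr__
compiled_module(torch.zeros(1))  # forward mutates the same attribute
print(module.x, compiled_module.x)  # both now read the same attribute (11 after one call)
```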

Co-authored-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122098
Approved by: https://github.com/ezyang, https://github.com/lezcano
2024-04-01 14:30:44 +00:00
dd8a24b8b7 [xla hash update] update the pinned xla hash (#123078)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned xla hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123078
Approved by: https://github.com/pytorchbot
2024-04-01 11:17:02 +00:00
4b725e1619 [AOTInductor] Support quantized linear on CPU with fbgemm (#123069)
Summary:
Added support for quantized linear on CPU with fbgemm.
Specifically, for torch.ops.quantized.linear_unpacked_dynamic_fp16, we
decompose it into two steps: packing the weight, and calling fbgemm's qlinear
with the packed weight.

Test Plan:
Included in commit.
test_aot_inductor::test_quantized_linear

Differential Revision: [D55577959](https://our.internmc.facebook.com/intern/diff/D55577959)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123069
Approved by: https://github.com/hl475
2024-04-01 09:15:05 +00:00
6b1f13ea2f Add skip models by device in Dynamo Test (#122591)
Fix the skip logic in `runner.py`: add a skip list defined per device for the dynamo benchmark runner `runner.py`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122591
Approved by: https://github.com/chuanqi129, https://github.com/desertfire, https://github.com/jgong5
2024-04-01 03:16:32 +00:00
8b7da5b791 Inductor cpp wrapper: fix dtype of ShapeAsConstantBuffer (#122297)
For `at::scalar_tensor` the default dtype will be `float` ([link to scalar_tensor](0d8e960f74/aten/src/ATen/native/TensorFactories.cpp (L856)), [link to default dtype](0d8e960f74/c10/core/TensorOptions.h (L551))) if we don't set the `dtype` value. However, the input scalar value is not necessarily a `float` value. With `torch::tensor(x)`, the dtype of the tensor will be decided according to the dtype of the scalar.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122297
Approved by: https://github.com/jgong5, https://github.com/desertfire
2024-04-01 01:32:41 +00:00
781e8d2201 [dynamo] Support __next__ on UserDefinedObjectVariable (#122565)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122565
Approved by: https://github.com/yanboliang
2024-03-31 19:00:03 +00:00
5fc0f52bf0 [BE] Use modern C++ in ATen tests (#123031)
`std::is_same<A, B>::value` -> `std::is_same_v<A, B>`
`std::is_floating_point<T>::value` -> `std::is_floating_point_v<T>`
And use constexpr instead of defining two mutually exclusive templates
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123031
Approved by: https://github.com/Skylion007
2024-03-31 16:07:38 +00:00
fa6178d246 [CI] Updated expected result files after https://github.com/pytorch/pytorch/pull/122846 (#123035)
Summary: Before https://github.com/pytorch/pytorch/pull/122846, pyhpc_isoneutral_mixing segfaulted in the AOTI inference run, so its result was not logged in the expected result file. Now it shows as fail_to_run instead of None.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/123035
Approved by: https://github.com/chenyang78
2024-03-31 13:56:00 +00:00
6c2f36c984 Upgrade submodule pybind to 2.12.0 (#122899)
To fix https://github.com/pytorch/pytorch/issues/122056

Building with NP 2.0 allows me to run locally with both NP 2.0 and 1.26.
Any other tests we should run, @rgommers?

FYI @Skylion007 @atalman
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122899
Approved by: https://github.com/Skylion007
2024-03-31 11:29:40 +00:00
cyy
6d8bb0e984 [Distributed] [1/N] Fix clang-tidy warnings in torch/csrc/distributed/c10d (#122884)
This PR fixes some clang-tidy warnings in distributed code.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122884
Approved by: https://github.com/kwen2501
2024-03-31 09:06:35 +00:00
a52e89b6f7 [inductor]re-enable cpu reduction ut (#122289)
Re-enable these two UTs. They pass on my local machine, and we can see their status in the CI for this PR.

See the background about why they are disabled https://github.com/pytorch/pytorch/issues/93542, https://github.com/pytorch/pytorch/issues/87157.

After https://github.com/pytorch/pytorch/pull/115620, the reduction orders should be deterministic.
However, the orders may not be exactly the same as the reference path (`aten`). We may set a larger tolerance if they still cannot pass in CI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122289
Approved by: https://github.com/jgong5, https://github.com/jansel
2024-03-31 08:33:14 +00:00
56451cd49d Enable x86 CPU vectorization on windows [submodule sleef] (#118980)
Enable VEC on Windows OS.
1. Fix some type definition gaps between Windows and Linux.
2. Fix some operators not supported on Windows, such as [] and /.
3. Enable static sleef library build on Windows.
4. Disable unsupported function overloading on MSVC.
5. Upgrade the sleef submodule, which fixes a build issue on Windows.
6. Fix bazel build issues.
7. Fix the test app not linking to sleef on Windows.

Note: If the rebuild fails after pulling this PR, please sync the `sleef` submodule by running:
```cmd
git submodule sync
git submodule update --init --recursive
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980
Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet
2024-03-31 03:07:32 +00:00
2b1ba0ceae [DeviceMesh] Cache and reuse sliced result (#122975)
Fixes #118849

Add a map for parent_to_child_mappings in _mesh_resources so we can cache and reuse submesh slicing results, avoiding recreating the submesh and the underlying sub-PG repeatedly, which could lead to funky behaviors.

We will follow up by reusing the pg from the parent_mesh during submesh creation.
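
A minimal memoization sketch of the idea (names and the helper are illustrative stand-ins, not the actual _mesh_resources internals): cache the sliced child mesh keyed by the parent mesh and the mesh dim name so repeated slicing reuses the same object.
```python
from typing import Any, Dict, Tuple

_parent_to_child_mappings: Dict[Tuple[int, str], Any] = {}

def _slice_submesh(parent_mesh, mesh_dim_name):
    # Hypothetical stand-in for the real slicing logic that builds a submesh
    # (and its sub process groups) from the parent mesh.
    return (parent_mesh, mesh_dim_name)

def get_or_create_submesh(parent_mesh, mesh_dim_name):
    # Reuse the cached result instead of recreating the submesh each time.
    key = (id(parent_mesh), mesh_dim_name)
    if key not in _parent_to_child_mappings:
        _parent_to_child_mappings[key] = _slice_submesh(parent_mesh, mesh_dim_name)
    return _parent_to_child_mappings[key]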

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122975
Approved by: https://github.com/wanchaol
2024-03-30 23:56:55 +00:00
35c493f2cf [CPP Extension] Escape include paths (#122974)
By using `shlex.quote` on Linux/Mac and `_nt_quote_args` on Windows

Tested by adding a non-existent path with spaces and a single quote.

TODO: Fix double quotes on Windows (this will require touching `_nt_quote_args`, so it is left for another day).
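
An illustrative sketch of the quoting approach (not the actual cpp_extension code): quote each include path so spaces and quotes don't split the compiler command line.
```python
import shlex

include_dirs = ["/opt/my libs/include", "/tmp/o'brien/include"]
# Each path is quoted so the shell treats it as a single argument.
flags = ["-I" + shlex.quote(d) for d in include_dirs]
print(" ".join(flags))
```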

Fixes https://github.com/pytorch/pytorch/issues/122476

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122974
Approved by: https://github.com/Skylion007
2024-03-30 21:58:29 +00:00
557e7c9c16 Add some type hints to functions and update a few spelling mistakes (#123015)
# Summary
While working on this PR: https://github.com/pytorch/pytorch/pull/121845
I found that these type hints made my IDE / newcomer experience easier to reason about.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/123015
Approved by: https://github.com/Skylion007
2024-03-30 21:15:01 +00:00
e203aa9fab [FSDP] [easy] fix HSDP validation error msg (#123019)
Summary:
This would otherwise yield

> ValueError: ('Manual wrapping with ShardingStrategy.HYBRID_SHARD', 'requires explicit specification of process group or device_mesh.')

which is odd.

Remove the extra trailing commas.
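
The bug pattern, for illustration: a trailing comma turns what was meant to be one message (implicit string concatenation across lines) into a tuple of two strings, which is why the ValueError printed a tuple.
```python
msg_as_tuple = (
    "Manual wrapping with ShardingStrategy.HYBRID_SHARD",  # <- stray comma makes this a tuple
    "requires explicit specification of process group or device_mesh."
)
msg_as_str = (
    "Manual wrapping with ShardingStrategy.HYBRID_SHARD "
    "requires explicit specification of process group or device_mesh."
)
print(type(msg_as_tuple).__name__, type(msg_as_str).__name__)  # tuple str
```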

Test Plan: CI

Differential Revision: D55549851

Pull Request resolved: https://github.com/pytorch/pytorch/pull/123019
Approved by: https://github.com/Skylion007
2024-03-30 18:12:34 +00:00
ec58f1f74e [inductor] make mask_rcnn inference work in max-autotune mode (#123008)
Inference for the vision_maskrcnn model fails when max-autotune is enabled.

Repro:
```
TORCHINDUCTOR_MAX_AUTOTUNE=1 time python benchmarks/dynamo/torchbench.py --accuracy --inference --bfloat16 --backend inductor --only vision_maskrcnn
```

It turns out that the max-autotune code receives an empty input tensor for convolution, and some places in the MA-related code do not handle this corner case properly. This PR fixes that, and the accuracy test above now passes.

Regarding why the input tensor is empty, it's probably because no objects are detected in the input images (random data?).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/123008
Approved by: https://github.com/jansel
2024-03-30 16:39:57 +00:00
5e878be101 Revert "Enable x86 CPU vectorization on windows [submodule sleef] (#118980)"
This reverts commit d94db5f6ee0af745c0d17cc6c87f695baa2b3b5f.

Reverted https://github.com/pytorch/pytorch/pull/118980 on behalf of https://github.com/atalman due to Breaks internal build ([comment](https://github.com/pytorch/pytorch/pull/118980#issuecomment-2028084839))
2024-03-30 14:20:54 +00:00
b8550f527f Support gpu trace on XPU (#121795)
# Motivation
Support GPU trace on the XPU backend by adding GPU trace to the XPU runtime. This is beneficial for generalizing the device caching allocator in the next step.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121795
Approved by: https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/jgong5, https://github.com/albanD
ghstack dependencies: #121794
2024-03-30 13:07:53 +00:00
eb7adc3ae0 Refactor gpu trace to be device-agnostic (#121794)
# Motivation
Refactor gpu trace to be device-agnostic. GPU trace is usually used in runtime components, including Device, Stream, Event, Guard, and Allocator. It should be device-agnostic and shareable across device backends.

# Solution
Move `_cuda_trace.py` to `_gpu_trace.py`, so that each device backend owns its own callbacks.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121794
Approved by: https://github.com/jgong5, https://github.com/albanD, https://github.com/EikanWang, https://github.com/gujinghui
2024-03-30 13:04:38 +00:00
99f8f77de9 [Inductor] Fix AFOC QPS Regression. (#122944)
Summary: Recently, we observed an ~8% QPS regression for the AFOC model. After digging into the problem, I found it was introduced by D55272024, where split-node normalization was skipped for call_method split nodes, while our pattern detection is based on the assumption that all split nodes have been normalized to call_function nodes. More context: https://docs.google.com/document/d/19h-fu2BqdUXMaSqbd7c0-Qe00ic7quUN-emJqH_1-SA/edit

Test Plan:
# unit test
```
buck2 test @mode/dev-nosan //caffe2/test/inductor:split_cat_fx_passes
```
Buck UI: https://www.internalfb.com/buck2/0792d406-3d64-4b9c-95cc-15fb0cc76a96
Test UI: https://www.internalfb.com/intern/testinfra/testrun/11258999096315690
Network: Up: 113KiB  Down: 535KiB  (reSessionID-6132c09b-2ce7-4e89-b61d-d6c6142630cc)
Jobs completed: 26. Time elapsed: 1:25.6s.
Cache hits: 0%. Commands: 2 (cached: 0, remote: 0, local: 2)
Tests finished: Pass 10. Fail 0. Fatal 0. Skip 0. Build failure 0
```
buck2 test @mode/dev-nosan //caffe2/test/inductor:group_batch_fusion
```
Test UI: https://www.internalfb.com/intern/testinfra/testrun/13792273886410433
Network: Up: 1.3MiB  Down: 960KiB  (reSessionID-0bea8575-f163-4c5d-b201-69e05806af98)
Jobs completed: 68. Time elapsed: 2:47.2s.
Cache hits: 0%. Commands: 13 (cached: 0, remote: 1, local: 12)
Tests finished: Pass 9. Fail 0. Fatal 0. Skip 0. Build failure 0

# local reproduce
```
buck2 run @mode/opt //scripts/jackiexu0313/pt2:local_model_with_pt2 -- --test_mode batch-split --model_type "afoc" --flow_id 545665840
```
Now the merge_splits_pass is conducted.
```
'inductor': Counter({'pattern_matcher_nodes': 1614, 'pattern_matcher_count': 1566, 'normalization_pass': 645, 'remove_split_with_size_one_pass': 629, 'batch_aten_mul': 13, 'scmerge_split_sections_removed': 11, 'scmerge_cat_removed': 5, 'scmerge_cat_added': 4, 'merge_splits_pass': 3, 'merge_getitem_cat_pass': 2, 'scmerge_split_removed': 2, 'batch_linear_post_grad': 2, 'batch_aten_sub': 2, 'batch_layernorm': 1, 'scmerge_split_added': 1})}
```

# e2e
baseline:
f545633808

before_fix:
f545665840

After_fix:
f546227494

proposal:

Differential Revision: D55513494

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122944
Approved by: https://github.com/jackiexu1992
2024-03-30 07:34:41 +00:00
2cd3ef4777 Check scale dtype for fake_quantize_per_channel_affine_cachemask (#120987)
Fixes #120903

The scale for fake quant is assumed to be FP32 but not checked. If scales of double dtype are passed in, an internal error is raised: `TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));` in aten/src/ATen/native/cpu/Loops.h.
This PR adds a check of the scale dtype.
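
A hedged example using the user-facing op (which is backed by the cachemask variant): scales must be float32, so cast a double scale before calling; after this check, a double scale raises a clear error instead of the internal assert.
```python
import torch

x = torch.randn(2, 3)
scale = torch.tensor([0.1, 0.2], dtype=torch.float64)  # double would trip the assert
zero_point = torch.zeros(2, dtype=torch.int32)
y = torch.fake_quantize_per_channel_affine(
    x, scale.float(), zero_point, 0, 0, 255  # cast the scale to float32
)
```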

Pull Request resolved: https://github.com/pytorch/pytorch/pull/120987
Approved by: https://github.com/jgong5, https://github.com/jerryzh168
2024-03-30 07:32:32 +00:00
07f0ff6ed7 [DCP][FSDP2][Test] Add_adamW to test_train_parity_2d_transformer_checkpoint_resume (#122002)
Want to add the option of AdamW here, as this is currently the only test for 2D.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122002
Approved by: https://github.com/awgu, https://github.com/fegin
2024-03-30 07:28:41 +00:00
ed457c7dbe [export] Add torch_fn (#122693)
This PR adds a new piece of metadata, `torch_fn`, which is meant to replace `source_fn_stack`, as `source_fn_stack` is not entirely well-defined between strict/non-strict. Previous discussion [here](https://docs.google.com/document/d/1sPmmsmh6rZFWH03QBOe49MaXrQkP8SxoG8AOMb-pFk4/edit#heading=h.anmx9qknhvm).

`torch_fn` represents the torch function that a particular aten operator came from. For example, `torch.nn.Linear` goes down to the `torch.nn.functional.linear` at the `__torch_function__` layer, and then `aten.t/aten.addmm` in the `__torch_dispatch__` layer. So the nodes `aten.t/aten.addmm` will now have the `torch_fn` metadata containing the `torch.nn.functional.linear`.

The `torch_fn` metadata is a tuple of 2 strings: a unique identifier for each torch function call, and the actual torch function `f"{fn.__class__}.{fn.__name__}"`. The purpose of the first value is to distinguish between 2 consecutive calls to the same function. For example, if we had 2 calls to `torch.nn.Linear`, the nodes and corresponding metadata would look something like:
```
aten.t - ("linear_1", "builtin_function_or_method.linear"),
aten.addmm - ("linear_1", "builtin_function_or_method.linear"),
aten.t - ("linear_2", "builtin_function_or_method.linear"),
aten.addmm - ("linear_2", "builtin_function_or_method.linear"),
```
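
A hedged sketch of inspecting the new metadata on an exported graph (assumes a simple module; the exact values depend on the traced program):
```python
import torch

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 4)

    def forward(self, x):
        return self.linear(x)

ep = torch.export.export(M(), (torch.randn(2, 4),))
for node in ep.graph.nodes:
    if node.op == "call_function":
        # torch_fn is the (call identifier, torch function) pair described above.
        print(node.name, node.meta.get("torch_fn"))
```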

Higher order ops -- currently we can get the torch_fn metadata for nodes within the HOO's subgraph, but after retracing, this becomes the `(cond, higher_order_op.cond)` :( This is because `fx_traceback.set_current_meta` points to the cond node in the toplevel graph, rather than the original node in the subgraph. I think this is because `fx.Interpreter` does not go into the cond subgraphs. (will discuss with Yidi more ab this)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122693
Approved by: https://github.com/tugsbayasgalan
2024-03-30 06:47:15 +00:00
3a9eead4ab [inductor] Don't compile MultiKernelCall in a subprocess (#123010)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123010
Approved by: https://github.com/shunting314
ghstack dependencies: #123009
2024-03-30 05:46:09 +00:00
6c0911f1d9 [inductor] Skip cudagraphs warning on CPU (#123009)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123009
Approved by: https://github.com/shunting314
2024-03-30 05:46:09 +00:00
0b7a156f68 [executorch hash update] update the pinned executorch hash (#122662)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned executorch hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122662
Approved by: https://github.com/pytorchbot
2024-03-30 05:18:53 +00:00
c66a44ea79 [AOTInductor] Support many outputs aliasing the same tensor (#122846)
fixes https://github.com/pytorch/pytorch/issues/122826

# Problem
When the model returns multiple outputs that alias the same tensor, we get a SEGFAULT, because we try to release the same buffer twice.
```
def forward(x):
  x_out = x + 1
  contig = x_out.contiguous()   # alias of same tensor as x_out
  return x_out, contig

run_impl() {
  output_handles[0] = buf0.release();
  output_handles[1] = buf0.release();   # SEGFAULT
}

# if we try to workaround this by assign aliases without creating a new tensor,
# then, we'll get a double free error during handle clean-up.
output_handles[1] = output_handles[0];    # assign without creating a new tensor
...
alloc_tensors_by_stealing_from_handles(){
  aoti_torch_delete_tensor_object(handles[0]);
  aoti_torch_delete_tensor_object(handles[1]);   # Double free
}
```

# Solution
~~Instead, we use the first `output_handle` that shares the same tensor and alias it.~~
```
output_handles[0] = buf0.release();
aoti_torch_alias_tensor(output_handles[0], &output_handles[1]);  # No SEGFAULT & No double free!
```

A simpler approach is to figure out which handles are duplicates. Then we simply copy all duplicates except the last one. The last one will use `std::move` and free the tensor owned by the model instance.
```
output_handles[0] = buf0.release();
output_handles[1] = output_handles[0];
```

Differential Revision: [D55455344](https://our.internmc.facebook.com/intern/diff/D55455344)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122846
Approved by: https://github.com/desertfire, https://github.com/chenyang78, https://github.com/jingsh
2024-03-30 04:41:17 +00:00
aaba3a87b1 tune down batch-size for res2net to avoid OOM (#122977)
The batch size for this model was previously 64. We later changed it to 256, which caused OOM in the cudagraphs setting. This PR tunes the batch size down to 128.

Share more logs from my local run
```
cuda,res2net101_26w_4s,128,1.603578,110.273572,335.263494,1.042566,11.469964,11.001666,807,2,7,6,0,0
cuda,res2net101_26w_4s,256,1.714980,207.986155,344.013071,1.058278,22.260176,21.034332,807,2,7,6,0,0
```

The log shows that torch.compile uses 11GB at batch size 128 and 21GB at batch size 256. I guess the benchmark script has extra overhead that causes the model to OOM at batch size 256 in the dashboard run.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122977
Approved by: https://github.com/Chillee
2024-03-30 03:54:53 +00:00
5a06b8ebfd Remove skipIfTorchDynamo from TestComposability in test_eager_transforms.py (#121830)
Fixes: https://github.com/pytorch/pytorch/issues/96559

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121830
Approved by: https://github.com/zou3519
ghstack dependencies: #121410, #121665
2024-03-30 01:55:04 +00:00
3d3d4e1cd5 export XPUStream to doc (#121398)
# Motivation
We would like to export XPUStream to public [doc](https://pytorch.org/cppdocs/api/library_root.html). The detailed documentation can help users understand and utilize XPU more effectively.

# Additional Context
A detailed XPUStream API and its usage should be documented in the public docs, like CUDA's [doc](https://github.com/pytorch/pytorch/blob/main/docs/cpp/source/notes/tensor_cuda_stream.rst).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121398
Approved by: https://github.com/gujinghui, https://github.com/jgong5, https://github.com/EikanWang, https://github.com/albanD
2024-03-30 00:36:26 +00:00
f4ff063c33 Add attributes to xpu device prop (#121898)
# Motivation
Add some attributes to `XPUDeviceProp` and expose them via `torch.xpu.get_device_properties` and `torch.xpu.get_device_capability`. They can be used in `torch.compile`  or directly passed to triton to generate more optimized code based on device properties.

# Additional Context
Expose the following attributes via `torch.xpu.get_device_properties` (usage is sketched after the list):
- `has_fp16` (newly added)
- `has_fp64` (newly added)
- `has_atomic64` (newly added)
- `driver_version`
- `vendor`
- `version`
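
A hedged usage sketch (assumes a PyTorch build with XPU support and an available XPU device):
```python
import torch

props = torch.xpu.get_device_properties(0)
# Newly exposed capability flags and identification fields.
print(props.has_fp16, props.has_fp64, props.has_atomic64)
print(props.driver_version, props.vendor, props.version)
print(torch.xpu.get_device_capability(0))
```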

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121898
Approved by: https://github.com/jgong5, https://github.com/EikanWang, https://github.com/malfet, https://github.com/albanD, https://github.com/atalman
2024-03-30 00:25:39 +00:00
b5bef9bbfd Fix cpp tests not running + failing to surface (#122845)
The comment in the code should have the information
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122845
Approved by: https://github.com/huydhn
2024-03-29 22:41:45 +00:00
4282bb8b07 [c10d] add the source rank which detects the timeout (#122850)
Summary:
When a rank detects a timeout from the TCPStore and triggers the dump, it's good to have more info about the source rank that detected the collective timeout locally. We just need to put the source rank as the value in the kvstore.
Test Plan:
In the unit test, we trigger the timeout on rank 0; rank 1 should then get the timeout signal from the store and log the correct source rank:

```
(sqzhang_1) [sqzhang@devgpu009.cln1 ~/pytorch (34d27652)]$  python
test/distributed/test_c10d_nccl.py NCCLTraceTestTimeoutDumpOnStuckRanks
NCCL version 2.19.3+cuda12.0
[rank0]:[E327 17:04:16.986381360 ProcessGroupNCCL.cpp:565] [Rank 0]
Watchdog caught collective operation timeout: WorkNCCL(SeqNum=2,
OpType=ALLREDUCE, NumelIn=12, NumelOut=12, Timeout(ms)=1000) ran for
1099 milliseconds before timing out.
[rank0]:[E327 17:04:16.988036373 ProcessGroupNCCL.cpp:1582] [PG 0 Rank
0] Timeout at NCCL work: 2, last enqueued NCCL work: 2, last completed
   NCCL work: 1.
   [rank0]:[E327 17:04:16.182548526 ProcessGroupNCCL.cpp:1346] [PG 0
   Rank 0] Received a timeout signal from this local rank and will start
   to dump the debug info. Last enqueued NCCL work: 2, last completed
   NCCL work: 1.
   [rank0]:[E327 17:04:16.247574460 ProcessGroupNCCL.cpp:1167] [PG 0
   Rank 0] ProcessGroupNCCL preparing to dump debug info.
   [rank1]:[E327 17:04:16.273332178 ProcessGroupNCCL.cpp:1346] [PG 0
   Rank 1] Received a global timeout from another rank 0, and will start
   to dump the debug info. Last enqueued NCCL work: 1, last completed
   NCCL work: 1.
   [rank1]:[E327 17:04:16.273565177 ProcessGroupNCCL.cpp:1167] [PG 0
   Rank 1] ProcessGroupNCCL preparing to dump debug info.
   [rank1]:[F327 17:04:16.274256512 ProcessGroupNCCL.cpp:1185] [PG 0
   Rank 1] [PG 0 Rank 1] ProcessGroupNCCL's watchdog detected a
   collective timeout from another rank 0 and notified the current rank.
   This is most likely caused by incorrect usages of collectives, e.g.,
   wrong sizes used across ranks, the order of collectives is not same
   for all ranks or the scheduled collective, for some reason, didn't
   run. Additionally, this can be caused by GIL deadlock or other
   reasons such as network errors or bugs in the communications library
   (e.g. NCCL), etc. We tried our best to dump the debug info into the
   storage to help you debug the issue.
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122850
Approved by: https://github.com/wconstab
2024-03-29 22:22:37 +00:00
d7d77a152c [ez] Increase slow grad check shards 4 to 6 (#122631)
They take almost 4 hours to run completely for one shard

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122631
Approved by: https://github.com/huydhn
2024-03-29 21:49:27 +00:00
ea33adf6c2 [vec] test VecMask in vec_test_all_types (#122878)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122878
Approved by: https://github.com/malfet
ghstack dependencies: #119979, #122869
2024-03-29 21:48:29 +00:00
c9b32c9caa [vec] test at::vec::convert in vec_test_all_types (#122869)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122869
Approved by: https://github.com/malfet
ghstack dependencies: #119979
2024-03-29 21:48:29 +00:00
6f4ed57b8a [inductor][cpp] unified the vectorized conversion with at::vec::convert for all data types (#119979)
This PR unified the vectorized conversion with `at::vec::convert` for all vectorized data types. The intrinsics implementations are implemented as a specialization and moved to their own arch-specific files. The vectorized conversion logic in cpp Inductor is simplified.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/119979
Approved by: https://github.com/jansel, https://github.com/malfet
2024-03-29 21:48:29 +00:00
05e54536fb [CI] Removed tests for torch.utils.tensorboard.summary.hparams (#122556)
Partially addresses #122160

In the module `torch.utils.tensorboard.summary`, the `hparams` method does not depend on any utilities from PyTorch, as it uses only utilities from `tensorboard`. Thus, I think it is safe to delete the test for the `hparams` method, since it does not depend on PyTorch.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122556
Approved by: https://github.com/huydhn
2024-03-29 21:44:02 +00:00
482d8bf1ea [aoti] Change aot_compile callsites (#122225)
Summary:
Replacing `torch._export.aot_compile` callsites with
```
ep = torch.export._trace._export(.., predispatch=True)   # Traces the given program into predispatch IR
so_path = torch._inductor.aot_compile_ep(ep, ...)  # Takes an exported program and compiles it into a .so
```

This allows us to explicitly split up the export step from AOTInductor. We can later modify tests to do `export + serialize + deserialize + inductor` to mimic internal production use cases better.

Test Plan: CI

Differential Revision: D54808612

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122225
Approved by: https://github.com/SherlockNoMad, https://github.com/khabinov
2024-03-29 21:34:20 +00:00
267145c5d0 Enable full state checking (#122971)
Fixes https://github.com/pytorch/pytorch/issues/115679

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122971
Approved by: https://github.com/anijain2305
2024-03-29 21:24:57 +00:00
4d6cb7bca0 Use Q-NEON register to compute the dot product (#122952)
Make transposed gemv a bit faster
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122952
Approved by: https://github.com/kimishpatel
ghstack dependencies: #122951
2024-03-29 21:09:08 +00:00
73e362756b Avoid COW materialize in conv forward ops (#122748)
Part of #97856

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122748
Approved by: https://github.com/ezyang
ghstack dependencies: #122720
2024-03-29 20:34:19 +00:00
cyy
7423092227 [TorchGen] [2/N] Remove unused variables and simplify dictionary iterations (#122585)
This PR continues to remove unused variables and simplifies dictionary iterations from TorchGen scripts, following #122576.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122585
Approved by: https://github.com/ezyang
2024-03-29 20:34:11 +00:00
57a9a64e10 [BE] Give a different error message when evaluating an integer. (#122938)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122938
Approved by: https://github.com/Skylion007
2024-03-29 19:14:15 +00:00
3178ba0dc9 Don't use sympy Float functions, use an opaque one with no reasoning (#122823)
Sympy simplifications don't obey floating-point semantics, so don't use
Sympy for this. Keep the expressions as-is; only evaluate with the reference
implementations when all arguments are known.
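
A concrete reason symbolic simplification can diverge from float semantics: floating-point addition is not associative, so rewriting (x + y) - x to y changes the computed value.
```python
# In float64, 1e16 + 1.0 rounds back to 1e16, so the "simplified" answer 1.0 is wrong.
x, y = 1e16, 1.0
print((x + y) - x)  # 0.0, not 1.0
```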

This may end up getting subsumed by some other changes later, but I
wanted to understand if this was easy and it seems to be easy.

This doesn't actually depend on the earlier diffs on the stack and I can detach it.

Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122823
Approved by: https://github.com/lezcano
2024-03-29 19:13:55 +00:00
ae0cf1f98d [TD][ez] Set pytest cache bucket default to gha-artifacts (#122901)
After https://github.com/pytorch/pytorch/pull/121907/files

Example failure: https://github.com/pytorch/pytorch/actions/runs/8473386479/job/23217733984#step:5:130
```
usage: pytest_cache.py [-h] (--upload | --download) --cache_dir CACHE_DIR
                       --pr_identifier PR_IDENTIFIER --job_identifier
                       JOB_IDENTIFIER [--sha SHA] [--test_config TEST_CONFIG]
                       [--shard SHARD] [--repo REPO] [--temp_dir TEMP_DIR]
                       [--bucket BUCKET]
pytest_cache.py: error: argument --bucket: expected one argument
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122901
Approved by: https://github.com/huydhn
2024-03-29 18:52:58 +00:00
99d939f51f [dynamo] Bugfix for HASATTR guard (#122947)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122947
Approved by: https://github.com/jansel
ghstack dependencies: #122828
2024-03-29 18:50:33 +00:00
0a7162f898 Fix svd_lowrank parameter M (#122681)
ISSUE: #122699

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122681
Approved by: https://github.com/lezcano
2024-03-29 18:06:38 +00:00
487b6d40ec Add RMSNorm module (#121364)
Similar to dbeed9724b/torchmultimodal/modules/layers/normalizations.py (L51)

**The implementation here is not optimized and we welcome pull requests to improve this**

- Use `normalized_shape` instead of a singular integer `dim` to be aligned with the `nn.LayerNorm` implementation (see the usage sketch after this list)
- Remove the [upcast to float and downcast](dbeed9724b/torchmultimodal/modules/layers/normalizations.py (L73))
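
A hedged usage sketch of the new module (per this PR, it takes a `normalized_shape` like `nn.LayerNorm` rather than a single dim):
```python
import torch
import torch.nn as nn

rms_norm = nn.RMSNorm(normalized_shape=[64])
x = torch.randn(8, 64)
y = rms_norm(x)
print(y.shape)  # torch.Size([8, 64])
```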

Differential Revision: [D55485840](https://our.internmc.facebook.com/intern/diff/D55485840)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121364
Approved by: https://github.com/albanD
2024-03-29 18:05:28 +00:00
3243be7c3a [FSDP2] Removed wrapSwapTensorsTest since no longer needed (#122962)
We do not need to set the flag after https://github.com/pytorch/pytorch/pull/122755.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122962
Approved by: https://github.com/mikaylagawarecki
2024-03-29 17:53:18 +00:00
a236fa9f06 Revert "[aoti] clear precomputed symbol replacements before cpp wrapper compilation (#122882)"
This reverts commit 384de46395234e793a319325e5c9d20a60407a64.

Reverted https://github.com/pytorch/pytorch/pull/122882 on behalf of https://github.com/jithunnair-amd due to broke ROCm CI ([comment](https://github.com/pytorch/pytorch/pull/122882#issuecomment-2027544640))
2024-03-29 17:52:39 +00:00
2a137f7af1 [dynamo] Support hasattr on UserDefinedClassVariable (#122564)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122564
Approved by: https://github.com/anijain2305
2024-03-29 17:34:14 +00:00
772e142e70 [dynamo] Delay cuda device registration (#122795)
The module-level `torch.cuda.device_count` calls are delayed until the registered devices are read.

Fixes #122085

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122795
Approved by: https://github.com/ezyang
2024-03-29 17:22:18 +00:00
315bd951e4 Add inductor fx pass unit test for shape propagation (#122897)
Summary: Pre-grad fx passes expect information from shape propagation to be present. D55221119 ensured that `pass_execution_and_save` invokes shape propagation, and this diff adds a covering unit test to prevent regression.

Test Plan: New UT passes locally.

Differential Revision: D55440240

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122897
Approved by: https://github.com/khabinov, https://github.com/Skylion007
2024-03-29 16:44:22 +00:00
b83c94339e Fix performance regression and memory storage handling of Flash Attention on ROCM (#122857)
This PR fixes two major issues that were discovered after the initial merge of PR #121561:
1. The Flash Attention support added there has severe performance regressions on regular shapes (power-of-two head dimensions and sequence lengths) compared with PR #115981. Its performance is worse than the math backend and it only has numerical-stability advantages. This PR fixes this problem.
2. There is a flaw in the memory storage handling in PR #121561 which does not copy the gradients back to the designated output tensor. This PR removes the deprecated `TensorStorageSanitizer` class, which is unnecessary due to the more flexible backward kernel shipped by PR #121561.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122857
Approved by: https://github.com/jeffdaily, https://github.com/drisspg
2024-03-29 16:37:24 +00:00
d8b69de73b [EZ] Run fp16 torch.mm/torch.mv across CPU threads (#122951)
This significantly speeds up real world applications, such as LLMs

Before this change, llama2-7b fp16 inference ran at 1.5 tokens per sec;
after it, it runs at almost 6 tokens per sec.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122951
Approved by: https://github.com/ezyang
2024-03-29 16:14:59 +00:00
cyy
fb90b4d4b2 [TorchGen] Use std::optional in generated code (#121454)
This PR changes TorchGen to generate std::optional.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121454
Approved by: https://github.com/ezyang
2024-03-29 14:11:09 +00:00
375a8041ed [AOTI][refactor] Improve logging (#122932)
Summary: Improve some logging msgs, and change a data type to remove a compile time warning.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122932
Approved by: https://github.com/chenyang78
2024-03-29 14:02:23 +00:00
cyy
769d1909f0 Enable clang-tidy warnings of aten/src/ATen/functorch (#122933)
Enable clang-tidy in aten/src/ATen/functorch,  following #122779.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122933
Approved by: https://github.com/ezyang
2024-03-29 14:01:28 +00:00
38946bff51 Added DispatchKey.CompositeImplicitAutograd to all upsample_nearest*.default decompositions (#122782)
Related to https://github.com/pytorch/pytorch/pull/117632#issuecomment-2021321172
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122782
Approved by: https://github.com/ezyang
2024-03-29 13:55:25 +00:00
b524a404e0 Fixed support for uint8 in upsample bicubic2d decomposition (#120411)
Supersedes https://github.com/pytorch/pytorch/pull/104248

Description:
- Fixed support for uint8 in the upsample bicubic2d decomposition (on `main` the results are wrong, so we can tolerate the slowdown; see the usage sketch after this list)
- Added the missing clamp(0, 1) for xscale and yscale
  - slowdown for f32 on CPU; the PR on node fusion on CPU (https://github.com/pytorch/pytorch/pull/120077) can help for upsampling cases with align_corners=True
  - the slowdown is mainly due to the added clamp op, and is partially reduced by using torch.stack in the weights computation on CPU
- Removed the lowering implementation
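
A hedged example of the op whose decomposition is being fixed: bicubic upsampling of a uint8 image tensor (antialias=False, as in the benchmarks below).
```python
import torch
import torch.nn.functional as F

x = torch.randint(0, 256, (1, 3, 500, 400), dtype=torch.uint8)
y = F.interpolate(x, size=(256, 256), mode="bicubic", align_corners=False)
print(y.dtype, y.shape)  # torch.uint8 torch.Size([1, 3, 256, 256])
```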

Benchmarks:
```
[-------------------------------------------------------------------------------------------------------------------------------------------------------- Interpolate, cpu --------------------------------------------------------------------------------------------------------------------------------------------------------]
                                                                                                                                                   |  Eager (2.4.0a0+git0c61c20) PR  |  Compiled (2.4.0a0+git0c61c20) PR  |  Compiled (2.4.0a0+git069270d) Nightly  |  speed-up PR vs Nightly  |  Eager (2.4.0a0+git069270d) Nightly
1 threads: -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
      Input (1, 3, 500, 400), torch.uint8, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (256, 256)       |        613.029 (+-1.590)        |         5477.608 (+-9.027)         |           3060.314 (+-12.368)           |     0.559 (+-0.000)      |          608.735 (+-6.336)
      Input (1, 3, 500, 400), torch.uint8, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (256, 256)      |        610.176 (+-1.428)        |        5718.503 (+-11.203)         |           3424.022 (+-12.836)           |     0.599 (+-0.000)      |          604.781 (+-6.229)
      Input (1, 3, 500, 400), torch.uint8, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (256, 256)           |        325.001 (+-0.840)        |        6183.029 (+-10.893)         |            3275.032 (+-7.625)           |     0.530 (+-0.000)      |          325.693 (+-1.067)
      Input (1, 3, 500, 400), torch.uint8, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (256, 256)          |        325.855 (+-1.108)        |        6391.394 (+-11.552)         |            3533.410 (+-7.666)           |     0.553 (+-0.000)      |          325.838 (+-1.457)
      Input (1, 3, 1200, 1300), torch.uint8, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (200, 300)     |       2521.533 (+-14.857)       |        5025.217 (+-13.415)         |            2814.304 (+-6.742)           |     0.560 (+-0.000)      |         2520.308 (+-10.796)
      Input (1, 3, 1200, 1300), torch.uint8, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (200, 300)    |       2531.204 (+-12.534)       |        5294.925 (+-11.994)         |            3147.590 (+-6.808)           |     0.594 (+-0.000)      |         2521.228 (+-11.732)
      Input (1, 3, 1200, 1300), torch.uint8, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (200, 300)         |        758.352 (+-10.362)       |        5639.912 (+-14.495)         |            3014.123 (+-8.799)           |     0.534 (+-0.000)      |          756.114 (+-4.792)
      Input (1, 3, 1200, 1300), torch.uint8, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (200, 300)        |        758.712 (+-5.781)        |         5927.541 (+-9.982)         |            3249.555 (+-7.226)           |     0.548 (+-0.000)      |          757.719 (+-5.653)
      Input (1, 3, 300, 400), torch.uint8, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (600, 700)       |       1524.469 (+-12.860)       |        34321.641 (+-80.310)        |           19373.714 (+-56.351)          |     0.564 (+-0.000)      |         1518.082 (+-49.653)
      Input (1, 3, 300, 400), torch.uint8, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (600, 700)      |       1521.746 (+-13.780)       |        35949.711 (+-81.010)        |           21782.366 (+-68.938)          |     0.606 (+-0.000)      |         1467.911 (+-15.901)
      Input (1, 3, 300, 400), torch.uint8, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (600, 700)           |        712.311 (+-5.361)        |        38826.510 (+-92.267)        |           20762.314 (+-59.303)          |     0.535 (+-0.000)      |          712.669 (+-4.673)
      Input (1, 3, 300, 400), torch.uint8, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (600, 700)          |        715.060 (+-4.757)        |        40269.353 (+-92.543)        |           22402.114 (+-81.574)          |     0.556 (+-0.000)      |          716.001 (+-8.945)

      Input (4, 3, 500, 400), torch.uint8, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (256, 256)       |       2331.889 (+-29.159)       |        21541.096 (+-72.346)        |           12181.194 (+-45.288)          |     0.565 (+-0.000)      |         2304.864 (+-21.351)
      Input (4, 3, 500, 400), torch.uint8, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (256, 256)      |       2333.697 (+-10.066)       |        22514.154 (+-57.798)        |           21709.449 (+-98.307)          |     0.964 (+-0.000)      |         2302.141 (+-13.041)
      Input (4, 3, 500, 400), torch.uint8, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (256, 256)           |        1198.768 (+-5.364)       |       37652.371 (+-101.644)        |           42740.413 (+-98.571)          |     1.135 (+-0.000)      |          1197.104 (+-7.225)
      Input (4, 3, 500, 400), torch.uint8, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (256, 256)          |        1196.851 (+-5.118)       |       39678.341 (+-173.750)        |           46807.738 (+-92.744)          |     1.180 (+-0.000)      |          1189.322 (+-5.681)
      Input (4, 3, 1200, 1300), torch.uint8, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (200, 300)     |       10020.978 (+-54.855)      |        19955.290 (+-71.891)        |           11420.521 (+-53.179)          |     0.572 (+-0.000)      |         9999.583 (+-61.230)
      Input (4, 3, 1200, 1300), torch.uint8, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (200, 300)    |       10066.441 (+-62.700)      |       21058.334 (+-183.414)        |           19986.577 (+-65.304)          |     0.949 (+-0.000)      |         10018.672 (+-59.188)
      Input (4, 3, 1200, 1300), torch.uint8, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (200, 300)         |       3171.135 (+-14.635)       |        19687.864 (+-54.320)        |           23313.699 (+-57.391)          |     1.184 (+-0.000)      |         3182.191 (+-17.686)
      Input (4, 3, 1200, 1300), torch.uint8, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (200, 300)        |       3181.314 (+-13.784)       |        20224.468 (+-50.827)        |          30541.963 (+-381.385)          |     1.510 (+-0.000)      |         3183.578 (+-16.203)
      Input (4, 3, 300, 400), torch.uint8, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (600, 700)       |       5879.450 (+-31.551)       |       136918.555 (+-480.320)       |          77723.568 (+-331.766)          |     0.568 (+-0.000)      |         5726.061 (+-87.517)
      Input (4, 3, 300, 400), torch.uint8, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (600, 700)      |       5882.869 (+-30.325)       |       143378.094 (+-513.842)       |         137244.074 (+-4827.730)         |     0.957 (+-0.000)      |         5727.679 (+-22.164)
      Input (4, 3, 300, 400), torch.uint8, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (600, 700)           |       2674.937 (+-45.003)       |      244829.360 (+-1930.579)       |         271283.073 (+-2243.245)         |     1.108 (+-0.000)      |         2676.054 (+-24.632)
      Input (4, 3, 300, 400), torch.uint8, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (600, 700)          |       2676.217 (+-16.601)       |      248658.668 (+-2904.952)       |         296514.520 (+-2983.281)         |     1.192 (+-0.000)      |         2682.844 (+-19.886)

      Input (1, 3, 500, 400), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (256, 256)     |        1768.437 (+-6.294)       |        2934.013 (+-28.870)         |            2520.649 (+-6.797)           |     0.859 (+-0.000)      |          1759.292 (+-5.097)
      Input (1, 3, 500, 400), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (256, 256)    |        1748.660 (+-5.550)       |         3271.104 (+-7.557)         |            2891.306 (+-7.632)           |     0.884 (+-0.000)      |          1746.341 (+-5.845)
      Input (1, 3, 500, 400), torch.float32, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (256, 256)         |        2813.150 (+-6.656)       |         3258.973 (+-7.543)         |            2766.286 (+-6.473)           |     0.849 (+-0.000)      |          2805.077 (+-7.611)
      Input (1, 3, 500, 400), torch.float32, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (256, 256)        |        2812.102 (+-8.211)       |         3568.780 (+-9.018)         |            3125.870 (+-7.324)           |     0.876 (+-0.000)      |          2834.178 (+-9.034)
      Input (1, 3, 1200, 1300), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (200, 300)   |        1687.975 (+-9.527)       |         2752.085 (+-9.627)         |            2373.274 (+-7.888)           |     0.862 (+-0.000)      |          1698.782 (+-8.098)
      Input (1, 3, 1200, 1300), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (200, 300)  |        1696.606 (+-8.678)       |        3056.317 (+-13.303)         |           2699.160 (+-10.638)           |     0.883 (+-0.000)      |         1684.942 (+-10.519)
      Input (1, 3, 1200, 1300), torch.float32, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (200, 300)       |        2613.491 (+-9.769)       |        3176.493 (+-13.366)         |            2730.193 (+-9.573)           |     0.859 (+-0.000)      |          2625.085 (+-9.943)
      Input (1, 3, 1200, 1300), torch.float32, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (200, 300)      |       2614.946 (+-34.129)       |        3465.398 (+-11.165)         |           3044.396 (+-11.447)           |     0.879 (+-0.000)      |          2627.355 (+-9.608)
      Input (1, 3, 300, 400), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (600, 700)     |       10784.549 (+-58.181)      |        18292.452 (+-59.344)        |           15909.922 (+-49.864)          |     0.870 (+-0.000)      |         10837.656 (+-51.947)
      Input (1, 3, 300, 400), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (600, 700)    |       10786.513 (+-52.308)      |        20449.038 (+-56.204)        |           18295.997 (+-54.522)          |     0.895 (+-0.000)      |         10843.751 (+-44.781)
      Input (1, 3, 300, 400), torch.float32, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (600, 700)         |       17532.699 (+-64.807)      |        20425.699 (+-80.271)        |           17517.040 (+-79.705)          |     0.858 (+-0.000)      |         17595.597 (+-61.870)
      Input (1, 3, 300, 400), torch.float32, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (600, 700)        |       17530.816 (+-55.131)      |        22450.080 (+-92.899)        |           19827.828 (+-77.649)          |     0.883 (+-0.000)      |         17615.934 (+-71.716)

      Input (4, 3, 500, 400), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (256, 256)     |       6875.484 (+-40.543)       |        11569.509 (+-62.462)        |          10053.350 (+-208.136)          |     0.869 (+-0.000)      |         6864.501 (+-46.747)
      Input (4, 3, 500, 400), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (256, 256)    |       6843.126 (+-44.498)       |        12915.236 (+-60.654)        |          25335.058 (+-382.640)          |     1.962 (+-0.000)      |         6899.002 (+-46.861)
      Input (4, 3, 500, 400), torch.float32, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (256, 256)         |       11103.418 (+-51.318)      |        28834.389 (+-78.395)        |          37405.463 (+-581.646)          |     1.297 (+-0.000)      |         11223.012 (+-60.709)
      Input (4, 3, 500, 400), torch.float32, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (256, 256)        |       11092.994 (+-70.835)      |       36597.023 (+-118.988)        |           45761.267 (+-85.051)          |     1.250 (+-0.000)      |         11104.014 (+-61.288)
      Input (4, 3, 1200, 1300), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (200, 300)   |       7106.791 (+-63.666)       |        11191.071 (+-45.402)        |           9786.037 (+-75.781)           |     0.874 (+-0.000)      |         7129.419 (+-77.674)
      Input (4, 3, 1200, 1300), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (200, 300)  |       7146.519 (+-28.376)       |        12443.571 (+-39.425)        |           20147.067 (+-74.771)          |     1.619 (+-0.000)      |         7179.622 (+-64.847)
      Input (4, 3, 1200, 1300), torch.float32, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (200, 300)       |       10533.849 (+-44.227)      |       34814.909 (+-138.127)        |          42803.001 (+-114.326)          |     1.229 (+-0.000)      |         10644.039 (+-59.681)
      Input (4, 3, 1200, 1300), torch.float32, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (200, 300)      |       10548.910 (+-44.221)      |       42876.940 (+-146.959)        |          49711.443 (+-139.276)          |     1.159 (+-0.000)      |         10652.375 (+-44.174)
      Input (4, 3, 300, 400), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (600, 700)     |      42814.521 (+-103.198)      |       73100.489 (+-435.262)        |          63587.659 (+-134.266)          |     0.870 (+-0.000)      |        43208.921 (+-195.287)
      Input (4, 3, 300, 400), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (600, 700)    |      42812.373 (+-103.870)      |       81769.160 (+-373.369)        |         175159.813 (+-2028.558)         |     2.142 (+-0.000)      |         43007.691 (+-96.358)
      Input (4, 3, 300, 400), torch.float32, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (600, 700)         |      69955.505 (+-373.373)      |      215248.616 (+-2040.775)       |         267511.246 (+-2094.161)         |     1.243 (+-0.000)      |        70382.679 (+-594.941)
      Input (4, 3, 300, 400), torch.float32, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (600, 700)        |      69852.157 (+-490.076)      |      242841.484 (+-19645.513)      |         317931.678 (+-2016.498)         |     1.309 (+-0.000)      |        70074.819 (+-352.919)

Times are in microseconds (us).

[-------------------------------------------------------------------------------------------------------------------------------------------------------- Interpolate, cuda ---------------------------------------------------------------------------------------------------------------------------------------------------------]
                                                                                                                                                     |  Eager (2.4.0a0+git0c61c20) PR  |  Compiled (2.4.0a0+git0c61c20) PR  |  Compiled (2.4.0a0+git069270d) Nightly  |  speed-up PR vs Nightly  |  Eager (2.4.0a0+git069270d) Nightly
1 threads: ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
      Input (1, 3, 2345, 2456), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (1234, 1345)   |         97.727 (+-0.018)        |          97.765 (+-0.025)          |             97.773 (+-0.027)            |     1.000 (+-0.000)      |           97.905 (+-0.040)
      Input (1, 3, 2345, 2456), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (1234, 1345)  |         97.615 (+-0.066)        |          97.332 (+-0.032)          |             97.950 (+-0.026)            |     1.006 (+-0.000)      |           97.690 (+-0.062)
      Input (1, 3, 2345, 2456), torch.float32, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (1234, 1345)       |        100.635 (+-0.033)        |         125.883 (+-0.020)          |            102.499 (+-0.116)            |     0.814 (+-0.000)      |          101.103 (+-0.027)
      Input (1, 3, 2345, 2456), torch.float32, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (1234, 1345)      |        100.898 (+-0.036)        |         109.717 (+-0.336)          |            102.558 (+-0.120)            |     0.935 (+-0.000)      |          101.642 (+-0.105)
      Input (4, 3, 2345, 2456), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (1234, 1345)   |        462.853 (+-0.028)        |         382.475 (+-0.047)          |            382.472 (+-0.033)            |     1.000 (+-0.000)      |          462.188 (+-0.014)
      Input (4, 3, 2345, 2456), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (1234, 1345)  |        462.783 (+-0.021)        |         382.806 (+-0.037)          |            382.563 (+-0.043)            |     0.999 (+-0.000)      |          462.089 (+-0.028)
      Input (4, 3, 2345, 2456), torch.float32, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (1234, 1345)       |        466.721 (+-0.022)        |         384.438 (+-0.027)          |            384.886 (+-0.037)            |     1.001 (+-0.000)      |          467.014 (+-0.025)
      Input (4, 3, 2345, 2456), torch.float32, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (1234, 1345)      |        466.993 (+-0.032)        |         384.212 (+-0.009)          |            383.946 (+-0.029)            |     0.999 (+-0.000)      |          466.575 (+-0.020)
      Input (1, 3, 1234, 1345), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (2345, 2456)   |        190.070 (+-0.082)        |         209.353 (+-1.096)          |            202.870 (+-0.888)            |     0.969 (+-0.000)      |          189.371 (+-0.164)
      Input (1, 3, 1234, 1345), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (2345, 2456)  |        190.021 (+-0.018)        |         210.504 (+-0.456)          |            201.814 (+-0.770)            |     0.959 (+-0.000)      |          189.314 (+-0.036)
      Input (1, 3, 1234, 1345), torch.float32, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (2345, 2456)       |        188.860 (+-0.207)        |         336.635 (+-0.023)          |            252.026 (+-0.510)            |     0.749 (+-0.000)      |          188.860 (+-0.170)
      Input (1, 3, 1234, 1345), torch.float32, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (2345, 2456)      |        188.725 (+-0.214)        |         276.329 (+-0.563)          |            251.439 (+-0.524)            |     0.910 (+-0.000)      |          188.776 (+-0.189)
      Input (4, 3, 1234, 1345), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: True, antialias: False, osize: (2345, 2456)   |        781.879 (+-0.086)        |         836.389 (+-7.177)          |            816.483 (+-6.626)            |     0.976 (+-0.000)      |          781.362 (+-0.106)
      Input (4, 3, 1234, 1345), torch.float32, torch.contiguous_format | mode: bicubic, align_corners: False, antialias: False, osize: (2345, 2456)  |        781.824 (+-0.099)        |         840.406 (+-7.111)          |            807.530 (+-6.514)            |     0.961 (+-0.000)      |          781.307 (+-0.129)
      Input (4, 3, 1234, 1345), torch.float32, torch.channels_last | mode: bicubic, align_corners: True, antialias: False, osize: (2345, 2456)       |        769.290 (+-0.309)        |         675.498 (+-1.537)          |            688.171 (+-4.326)            |     1.019 (+-0.000)      |          769.830 (+-0.222)
      Input (4, 3, 1234, 1345), torch.float32, torch.channels_last | mode: bicubic, align_corners: False, antialias: False, osize: (2345, 2456)      |        769.240 (+-0.179)        |         675.800 (+-1.113)          |            673.176 (+-1.740)            |     0.996 (+-0.000)      |          769.935 (+-0.171)

Times are in microseconds (us).

```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/120411
Approved by: https://github.com/lezcano
2024-03-29 13:15:25 +00:00
d94db5f6ee Enable x86 CPU vectorization on windows [submodule sleef] (#118980)
Enable VEC on Windows OS.
1. Fix some type definition gaps between Windows and Linux.
2. Fix some operators not supported on Windows, such as `[]` and `/`.
3. Enable static sleef library build on Windows.
4. Disable unsupported function overloading on MSVC.
5. Upgrade the sleef submodule, which fixes a build issue on Windows.
6. Fix bazel build issues.
7. Fix the test app not linking to sleef on Windows.

Note: If the rebuild fails after pulling this PR, please sync the `sleef` submodule by running:
```cmd
git submodule sync
git submodule update --init --recursive
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980
Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet
2024-03-29 07:28:31 +00:00
35c56f85fd [dynamo][pt2d] avoid skipping modules from torch/testing/_internal (#122851)
Dynamo skips user-defined modules from `torch/testing/_internal` (e.g. MLP, Transformer). This PR adds `torch/testing/_internal/...` to `manual_torch_name_rule_map`, which ensures FSDP CI + torch.compile are meaningfully tested.

unit test shows frame count = 0 before and frame count > 0 after
```pytest test/dynamo/test_trace_rules.py -k test_module_survive_skip_files```

Some FSDP unit tests actually start to compile modules with this change; add a triton availability check or disable those tests for now.
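
As a rough illustration (the module import and constructor below are assumptions; CompileCounter is Dynamo's test backend that counts compiled frames), this is the kind of check the rule-map change enables:

```python
import torch
from torch._dynamo.testing import CompileCounter
# Assumed location/signature of the internal test MLP; any small nn.Module under
# torch/testing/_internal exercises the same code path.
from torch.testing._internal.common_fsdp import MLP

cnt = CompileCounter()
mod = torch.compile(MLP(16), backend=cnt)
mod(torch.randn(2, 16))
assert cnt.frame_count > 0  # was 0 while Dynamo skipped torch/testing/_internal
```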

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122851
Approved by: https://github.com/jansel
2024-03-29 06:42:06 +00:00
10bdf64427 Properly pexpr the actual sympy.Expression, don't repr it. (#122893)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122893
Approved by: https://github.com/albanD, https://github.com/desertfire, https://github.com/jansel
2024-03-29 06:40:19 +00:00
ed37fbdf60 made gpt_fast benchmark run faster (#122872)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122872
Approved by: https://github.com/msaroufim, https://github.com/yifuwang
ghstack dependencies: #122848
2024-03-29 03:49:19 +00:00
b9c9f037d1 Added some checkpointing tests (#122848)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122848
Approved by: https://github.com/anijain2305
2024-03-29 03:49:19 +00:00
b6201a60c5 [BE] minor logging cleanup in distributed (#122921)
Summary:
    Minor logging cleanup in distributed library
    1. Don't use "f" formatted strings - address linter issues.
    2. Nits: Make use of unused `e` (error) in a few logs.
    3. Change info->debug as asked in issue #113545
    4. Nit: rename log -> logger in a few files for consistency
    5. Fix a linter error.

    Test Plan:
    1. Local build passes.
    2. Linter is happy.

    Reviewers: wanchaol

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122921
Approved by: https://github.com/wanchaol
2024-03-29 03:34:01 +00:00
6a45809580 Simplify forward AD missing support error (#122639)
This thing about jit decomposition confuses users greatly and I'm not sure what it adds. So removing it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122639
Approved by: https://github.com/soulitzer
2024-03-29 02:11:46 +00:00
76d8020e62 Add tests for pre_dispatch + run_decomp flow and taskify failures (#122508)
Differential Revision: [D55448616](https://our.internmc.facebook.com/intern/diff/D55448616)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122508
Approved by: https://github.com/angelayi, https://github.com/zhxchen17
2024-03-29 01:47:07 +00:00
cyy
f041df8530 Fix order conditioning of norm kernel (#122874)
NormOneOps is not executed due to an incorrect comparison; this PR fixes it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122874
Approved by: https://github.com/Skylion007
2024-03-29 00:28:13 +00:00
6b8205d3de Revert "Support map in pre-dispatch functionalization (#121444)"
This reverts commit 079feea3379c021a330dbfac7668a5fc8fccc3bd.

Reverted https://github.com/pytorch/pytorch/pull/121444 on behalf of https://github.com/clee2000 due to sorry windows failure seems related 079feea337 https://github.com/pytorch/pytorch/actions/runs/8474191301/job/23220791555. PR got force merged before windows job finished ([comment](https://github.com/pytorch/pytorch/pull/121444#issuecomment-2026323614))
2024-03-28 23:42:26 +00:00
16771747c2 Add tensor step and capturable support to rprop (#122261)
Towards fixing https://github.com/pytorch/pytorch/issues/115679
Fixes Rprop step update while compiling

Also adds capturable support + testing
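
A minimal usage sketch (assuming a CUDA build; `capturable` is the flag this PR adds to Rprop) of compiling the optimizer step:

```python
import torch

model = torch.nn.Linear(8, 8, device="cuda")
opt = torch.optim.Rprop(model.parameters(), lr=1e-2, capturable=True)

@torch.compile
def step():
    opt.step()  # with a tensor `step`, the update compiles instead of graph-breaking

loss = model(torch.randn(4, 8, device="cuda")).sum()
loss.backward()
step()
```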

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122261
Approved by: https://github.com/janeyx99
2024-03-28 23:31:18 +00:00
e63e013c3b Skip use_count() debug assert for _nested_get_offsets() (#122917)
This broke [internal tests](https://www.internalfb.com/intern/test/844425064039866/) that run with unset `NDEBUG`. It wasn't initially caught because we don't test with unset `NDEBUG` in OSS CI.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122917
Approved by: https://github.com/soulitzer
ghstack dependencies: #122902
2024-03-28 23:19:17 +00:00
6fc5ad931c Use zeros for NJT dummy to avoid messing with randomness (#122902)
Use of randomness was breaking vmap.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122902
Approved by: https://github.com/vmoens, https://github.com/zou3519
2024-03-28 22:09:31 +00:00
f476d707fd Remove previous grad impl. in torch dynamo (#122215)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122215
Approved by: https://github.com/zou3519
2024-03-28 22:00:23 +00:00
079feea337 Support map in pre-dispatch functionalization (#121444)
When we enter map_autograd, we try to trace through the fwd/bwd of a map operator that is wrapped in the ctx.functionalize wrapper. This forces us to go through PreDispatch functionalization again (only the Python part). As a result, it revealed a previous bug where pre-dispatch mode handling doesn't actually manage the local dispatch key set (if there is no active mode, we need to turn off the PreDispatch key). This PR fixes that. I also shuffled some APIs around so that there is less code duplication, as the setting/unsetting logic is quite hard to get right.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121444
Approved by: https://github.com/bdhirsh
2024-03-28 21:56:36 +00:00
481c9bb1fc Upgrade submodule oneDNN to v3.3.6 (#122164)
As the title says. Includes fixes for the following aarch64 issues:
- https://github.com/oneapi-src/oneDNN/pull/1831
- https://github.com/oneapi-src/oneDNN/pull/1834

---

## Validation results
(on Intel CPU + Linux)
**Static quantization with Inductor on CV models**

Quant method | Geomean throughput ratio (v3.3.6/baseline)
-- | --
ptq | 0.982937
ptq (cpp wrapper) | 0.978384
qat | 0.978828

**Torchbench cpu userbenchmark with Inductor**

Items | Perf Geomean Ratio (v3.3.6/baseline)
-- | --
eager_throughtput_bf16_infer | 1.00x
eager_throughtput_fp32_infer | 1.00x
jit_llga_throughtput_amp_bf16 | 1.01x
jit_llga_throughtput_fp32 | 1.00x
eager_throughtput_fx_int8 | 1.00x
eager_throughtput_bf16_train | 1.46x
eager_throughtput_fp32_train | 1.41x

**Dynamo benchmarks tests**
Precision | Shape | Wrapper | Thread | Eager old/new GEOMEAN | Inductor old/new GEOMEAN
-- | -- | -- | -- | -- | --
Float32 | Static | Default | Multiple | 1.003836812 | 1.003425
Float32 | Static | Default | Single | 1.000181451 | 0.999611
Float32 | Dynamic | Default | Multiple | 1.003980183 | 1.006563
Float32 | Dynamic | Default | Single | 1.000076939 | 0.999969
AMP | Static | Default | Multiple | 0.996824772 | 0.998715
AMP | Static | Default | Single | 0.996402574 | 1.001483
AMP | Dynamic | Default | Multiple | 0.994919866 | 1.000467
AMP | Dynamic | Default | Single | 0.9962054 | 1.000767

(on Aarch64)
https://github.com/pytorch/pytorch/pull/122164#issuecomment-2007912919

---

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122164
Approved by: https://github.com/snadampal, https://github.com/malfet, https://github.com/atalman
2024-03-28 21:36:27 +00:00
3924d2189c [FSDP2] Simplified _move_states_to_device (#122907)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122907
Approved by: https://github.com/Skylion007
2024-03-28 21:22:59 +00:00
3beb9d85a6 Revert "Add non strict inline constraints and runtime assertions to non-strict exported program (#122722)"
This reverts commit b693fff5d72b249d39436ced577a88d3b866bbba.

Reverted https://github.com/pytorch/pytorch/pull/122722 on behalf of https://github.com/BoyuanFeng due to This breaks torchrec.distributed.tests.test_pt2.TestPt2: test_kjt__getitem__ ([comment](https://github.com/pytorch/pytorch/pull/122722#issuecomment-2026078351))
2024-03-28 20:42:35 +00:00
8852b09abc [FSDP2] Used _chunk_cat for reduce-scatter copy-in (#122888)
This PR uses `_chunk_cat` to fuse, into a single op, padding gradients on dim-0, chunking them into `world_size` chunks, and copying them into the reduce-scatter input.
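
For intuition, here is a rough, unfused sketch of the semantics being fused (it mirrors the idea, not the actual kernel or FSDP2's exact memory layout):

```python
import torch
import torch.nn.functional as F

def chunk_cat_reference(grads, world_size):
    per_rank = [[] for _ in range(world_size)]
    for g in grads:
        pad_rows = (-g.shape[0]) % world_size              # pad dim-0 to a multiple of world_size
        g = F.pad(g, [0, 0] * (g.dim() - 1) + [0, pad_rows])
        for i, chunk in enumerate(g.chunk(world_size, dim=0)):
            per_rank[i].append(chunk.reshape(-1))
    # reduce-scatter input: all chunks destined for rank i are laid out contiguously
    return torch.cat([torch.cat(chunks) for chunks in per_rank])

out = chunk_cat_reference([torch.randn(5), torch.randn(7, 3)], world_size=4)
```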

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122888
Approved by: https://github.com/yifuwang, https://github.com/BoyuanFeng, https://github.com/weifengpy
ghstack dependencies: #122726, #122847
2024-03-28 20:35:45 +00:00
8df99732a4 Revert "Workaround dind-rootless volumes mount as root (#122787)"
This reverts commit 84dc76156a0b8a73e56d80c3947ed9dd03c5ac5e.

Reverted https://github.com/pytorch/pytorch/pull/122787 on behalf of https://github.com/zxiiro due to This broke rocm tests ([comment](https://github.com/pytorch/pytorch/pull/122787#issuecomment-2026022659))
2024-03-28 20:10:19 +00:00
dacc73669c [export] Make quantizer compatible with the standard nn_module_stack. (#122819)
Summary: When we migrate to torch.export, we won't put L['self'] as the prefix for all the FQNs in nn_module_stack. This diff adds a branch to handle the new case.

Test Plan: buck test mode/opt caffe2/test/quantization:test_quantization -- -r set_module_name

Differential Revision: D55436617

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122819
Approved by: https://github.com/tugsbayasgalan
2024-03-28 19:36:46 +00:00
384de46395 [aoti] clear precomputed symbol replacements before cpp wrapper compilation (#122882)
After we codegen a triton kernel in the triton codegen backend,
we cache the generated triton source code in the wrapper to avoid
producing multiple triton kernels with the same content.

In AOTI compilation flow, this caching mechanism imposes a strong requirement
on the codegen that we must generate the same triton source code
for the same schedule node in both python and cpp codegen phases.
Otherwise, we would end up with a mismatch between the kernel name
formed in the cpp codegen and the cuda kernel key produced from
the python codegen. Consequently, we would hit a missing-cuda-kernel
error.

The precomputed symbol replacements saved in V.graph.sizevars
can cause such source-code inconsistency related to the code for indexing
tensors. For example, let's say in the python codegen phase,
we produce "ks2\*48" as part of indexing an input for schedule
node A while yielding a replacement pair "ks0 -> ks2\*48" in
the precomputed replacements. In the second cpp codegen phase,
we would produce "ks0" for the same indexing code of schedule
node A due to the "ks0 -> ks2*48" replacement pair.

This PR fixes the issue by clearing precomputed_replacements
and inv_precomputed_replacements before cpp wrapper codegen.
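
The toy sketch below (not Inductor's real data structures) illustrates the invariant: both passes must render the same indexing expression, and carrying a precomputed replacement into the second pass breaks that.

```python
import sympy

ks2 = sympy.Symbol("ks2")
precomputed_replacements = {}  # state that used to survive into the cpp wrapper pass

def render_index(expr):
    for short, long_expr in precomputed_replacements.items():
        if expr == long_expr:
            return str(short)            # later passes reuse the precomputed symbol
    short = sympy.Symbol(f"ks{len(precomputed_replacements)}")
    precomputed_replacements[short] = expr
    return str(expr)                     # the first pass emits the full expression

python_pass = render_index(ks2 * 48)     # "48*ks2", and records ks0 -> ks2*48
cpp_pass = render_index(ks2 * 48)        # "ks0" -> generated source no longer matches

precomputed_replacements.clear()         # the fix: clear before cpp wrapper codegen
cpp_pass_fixed = render_index(ks2 * 48)  # "48*ks2" again, matching the python pass
print(python_pass, cpp_pass, cpp_pass_fixed)
```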

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122882
Approved by: https://github.com/desertfire
2024-03-28 19:06:29 +00:00
646dd1ab8d Rewrite quantized conv transpose2d for vulkan (#122547)
Summary: Vulkan rewrite so that quantized transpose 2d ops can run in a model

Test Plan:
Run vulkan api test:
# buck2 build --target-platforms ovr_config//platform/macos:arm64-fbsource  //xplat/caffe2:pt_vulkan_api_test_binAppleMac\#macosx-arm64 -c pt.vulkan_full_precision=1 --show-output"
# buck-out//v2/gen/fbsource/xplat/caffe2/pt_vulkan_api_test_binAppleMac
Running main() from third-party/googletest/1.14.0/googletest/googletest/src/gtest_main.cc
[==========] Running 418 tests from 1 test suite.
[----------] Global test environment set-up.
[----------] 418 tests from VulkanAPITest
....
[----------] Global test environment tear-down
[==========] 418 tests from 1 test suite ran. (4510 ms total)
[  PASSED  ] 417 tests.
[  SKIPPED ] 1 test, listed below:
[  SKIPPED ] VulkanAPITest.querypool_flushed_shader_log

  YOU HAVE 9 DISABLED TESTS

Run quantized vulkan api test: Note the linear quantized are failing but all the convolution tests still pass. Linear failures are being debugged.
# buck2 build --target-platforms ovr_config//platform/macos:arm64-fbsource  //xplat/caffe2:pt_vulkan_quantized_api_test_binAppleMac\#macosx-arm64 -c pt.vulkan_full_precision=1 --show-output"
# buck-out//v2/gen/fbsource/xplat/caffe2/pt_vulkan_quantized_api_test_binAppleMac
Running main() from third-party/googletest/1.14.0/googletest/googletest/src/gtest_main.cc
[==========] Running 86 tests from 1 test suite.
[----------] Global test environment set-up.
[----------] 86 tests from VulkanAPITest
...
[  PASSED  ] 77 tests.
[  FAILED  ] 9 tests, listed below:
[  FAILED  ] VulkanAPITest.linear_2d_flat
[  FAILED  ] VulkanAPITest.linear_2d_small
[  FAILED  ] VulkanAPITest.linear_2d_large
[  FAILED  ] VulkanAPITest.linear_3d_flat
[  FAILED  ] VulkanAPITest.linear_3d_small
[  FAILED  ] VulkanAPITest.linear_3d_large
[  FAILED  ] VulkanAPITest.linear_4d_flat
[  FAILED  ] VulkanAPITest.linear_4d_small
[  FAILED  ] VulkanAPITest.linear_4d_large

 9 FAILED TESTS
  YOU HAVE 8 DISABLED TESTS

# Run CUNET quantized model on hibiki board.

Reviewed By: manuelcandales

Differential Revision: D52344263

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122547
Approved by: https://github.com/manuelcandales, https://github.com/copyrightly, https://github.com/yipjustin
2024-03-28 18:51:44 +00:00
71b5b7e081 Let dynamo trace some functions in functorch.deprecated.* namespace (#121665)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121665
Approved by: https://github.com/zou3519
ghstack dependencies: #121410
2024-03-28 18:50:43 +00:00
966ae943df Add wrapper for fbgemm quantization operations (#122763)
Summary:
We add wrappers for fbgemm's packing so we can pass it through PT2 to the
lowering phase of AOTInductor.

Test Plan:
Included in commit.
test_quantized_ops::test_wrapped_fbgemm_linear_fp16

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D55433204](https://our.internmc.facebook.com/intern/diff/D55433204)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122763
Approved by: https://github.com/jerryzh168
ghstack dependencies: #122762
2024-03-28 18:41:18 +00:00
e296722e0e Z3 validation: Lift operators later when we actually run with Z3 (#122791)
Previously, we lifted operators by putting them into the FX graph, limiting
the applicability of the FX graph to only Z3.  Now, we lift operators
when we are interpreting, which means I can use the graph for other
things.

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122791
Approved by: https://github.com/Chillee, https://github.com/lezcano
2024-03-28 18:31:30 +00:00
3d2d7ba19d Delete torch.autograd.function.traceable APIs (#122817)
We deprecated them in 2.3 with plans to delete in 2.4. Very few OSS
repos use this flag at all and it also does nothing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122817
Approved by: https://github.com/albanD
2024-03-28 18:24:15 +00:00
a3b30851c5 Add quantized.linear_unpacked_dynamic_fp16 (#122762)
Summary:

We add a new op quantized.linear_unpacked_dynamic_fp16, which is essentially linear_dynamic_fp16 with a different (unpacked) weight/bias format.
This op does packing on the fly for each call with standard at::Tensor weight & bias.

Test Plan:
Included in commit.
test_quantized_op::test_unpacked_qlinear_dynamic_fp16

Differential Revision: [D55433203](https://our.internmc.facebook.com/intern/diff/D55433203)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122762
Approved by: https://github.com/jerryzh168
2024-03-28 18:02:27 +00:00
59f6393209 [docs] Update PT2+Profiler docs (#122272)
Document:
* Torch-Compiled Region
* What to expect in kernels inside a torch-compiled region

For review, see https://docs-preview.pytorch.org/pytorch/pytorch/122272/torch.compiler_profiling_torch_compile.html

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122272
Approved by: https://github.com/aaronenyeshi
2024-03-28 17:52:28 +00:00
091a24495b [AOTInductor] Support use_runtime_constant_folding for CPU. (#122563)
Summary:
We allow CPU to use the config use_runtime_constant_folding.
Changes include:
1. Rearrange USE_CUDA flags. Add CPU sections that consume memory directly.
2. Codegen changes to accommodate cpp fusions for CPU only. Specifically, we shouldn't generate two headers that would cause re-declaration.
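
A rough usage sketch (the option spelling mirrors the Inductor config name and, like the `aot_compile` entry point used here, should be treated as an assumption) of enabling this for a CPU model:

```python
import torch

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.w = torch.nn.Parameter(torch.randn(16, 16))

    def forward(self, x):
        return x @ (self.w + self.w)  # parameter-only subexpression, foldable at load time

so_path = torch._export.aot_compile(
    M(),
    (torch.randn(4, 16),),
    options={"aot_inductor.use_runtime_constant_folding": True},
)
```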

Test Plan: Activate tests that were deactivated for CPU before.

Reviewed By: khabinov

Differential Revision: D55234300

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122563
Approved by: https://github.com/chenyang78
2024-03-28 17:49:05 +00:00
8a33a77fd1 Back out "Added a check in register_lowering to avoid decomposed ops (#117632)" (#122709)
Summary:
Original commit changeset: ebda663a196b

Original Phabricator Diff: D55271788

Test Plan: Some models are failing torch compile with this, retrying the tests

Reviewed By: colinchan15

Differential Revision: D55374457

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122709
Approved by: https://github.com/huydhn
2024-03-28 17:46:57 +00:00
4670dcc94c [Inductor]Fix a couple of broken unit tests (#122714)
Summary: As titled.

Test Plan:
```
buck2 test mode/dev-nosan //caffe2/test/inductor:group_batch_fusion
```
Buck UI: https://www.internalfb.com/buck2/ad05a43c-cb4a-443e-8904-b4d53e4f4b1e
Test UI: https://www.internalfb.com/intern/testinfra/testrun/13510798909218388
Network: Up: 107KiB  Down: 28KiB  (reSessionID-d7146e4f-773a-46ea-9852-f10f59302479)
Jobs completed: 24. Time elapsed: 1:49.3s.
Cache hits: 0%. Commands: 2 (cached: 0, remote: 0, local: 2)
Tests finished: Pass 9. Fail 0. Fatal 0. Skip 0. Build failure 0

```
buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor/fb:split_cat_fx_passes_fb
```

Buck UI: https://www.internalfb.com/buck2/82dbf3b0-c747-4c07-98b8-53b69afa3157
Test UI: https://www.internalfb.com/intern/testinfra/testrun/1125900267699118
Network: Up: 1.4GiB  Down: 2.3GiB  (reSessionID-0bd22c6d-5dfe-4b4a-bc24-705eadac884b)
Jobs completed: 252570. Time elapsed: 7:25.2s.
Cache hits: 95%. Commands: 123778 (cached: 117999, remote: 2779, local: 3000)
Tests finished: Pass 1. Fail 0. Fatal 0. Skip 0. Build failure 0

Differential Revision: D55378009

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122714
Approved by: https://github.com/SherlockNoMad
2024-03-28 17:44:30 +00:00
07f94df1a6 [torch quantization]fix HistogramObserver OOM when (self.max_val - self.min_val) is too small (#122659)
Differential Revision: D55347133

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122659
Approved by: https://github.com/jerryzh168
2024-03-28 17:41:21 +00:00
d65b9dff73 [AMD] turn off triton memcache for amd devices (#122560)
Summary:
triton memcache is not supported on amd devices yet and causes torch.compile to fail

Created from CodeHub with https://fburl.com/edit-in-codehub

Test Plan:
ci

Sandcastle run

Differential Revision: D55285655

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122560
Approved by: https://github.com/jansel
2024-03-28 17:38:21 +00:00
d9a08de9a4 Add Opinfo entries for HOP testing (#122265)
In this PR, we add a systematic way to test that all HOPs are exportable, as the export team has been running into various bugs related to newly added HOPs due to a lack of tests. We do this by creating:
- hop_db -> a list of HOP OpInfo tests which is then used inside various flows including export functionalities: aot-export, pre-dispatch export, retrace, and ser/der

For now, we also create an allowlist so that people can bypass the failures, but we should discourage people from doing that.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122265
Approved by: https://github.com/ydwu4, https://github.com/zou3519
2024-03-28 17:36:43 +00:00
0bfa9f4758 [ROCm][ATen][Native] Fix kernel cache selecting kernels for incorrect architectures (#121401)
Fixes #120794

Torch creates a cache of compiled kernels at $HOME/.cache/torch/kernels. The names used to save and select the cached kernels use cuda_major and cuda_minor to identify the GPU architecture for which the kernels were compiled. On ROCm this is insufficient, because cudaDeviceProp's cuda_major and cuda_minor are mapped to hipDeviceProp_t::major and hipDeviceProp_t::minor, which correspond to the first and second numbers of the LLVM target for the architecture in question:

GFX1030 is major = 10, minor = 3
GFX1032 is major = 10, minor = 3
GFX900 is major = 9,  minor = 0
GFX906 is major = 9,  minor = 0
GFX908 is major = 9,  minor = 0

Thus it can be seen that hipDeviceProp_t::major and hipDeviceProp_t::minor are insufficient to uniquely identify the ROCm architecture. This causes the ROCm runtime to raise an error when an operation uses a cached kernel that was first cached on an architecture with the same hipDeviceProp_t::major and hipDeviceProp_t::minor but a different LLVM target.

The solution provided in this PR is to replace the use of hipDeviceProp_t::major/hipDeviceProp_t::minor with hipDeviceProp_t::gcnArchName when PyTorch is compiled for ROCm; gcnArchName contains a string identical to the LLVM target of the architecture in question.
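
A small illustration (toy code, not the actual cache implementation) of why the (major, minor) pair collides while gcnArchName does not:

```python
archs = {
    "gfx1030": (10, 3),
    "gfx1032": (10, 3),  # same (major, minor) as gfx1030, different ISA
    "gfx900": (9, 0),
    "gfx906": (9, 0),
    "gfx908": (9, 0),
}

def old_cache_key(major, minor):
    return f"kernel_{major}{minor}"   # gfx1030 and gfx1032 both map to "kernel_103"

def new_cache_key(gcn_arch_name):
    return f"kernel_{gcn_arch_name}"  # unique per LLVM target

assert old_cache_key(*archs["gfx1030"]) == old_cache_key(*archs["gfx1032"])  # the collision
assert new_cache_key("gfx1030") != new_cache_key("gfx1032")                  # the fix
```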

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121401
Approved by: https://github.com/jeffdaily, https://github.com/hongxiayang, https://github.com/malfet
2024-03-28 17:24:31 +00:00
9693797491 [PT2][Inductor][Observability] Improve the optimus scuba log (#122361)
Summary: As titled.

Test Plan:
```
buck2 test mode/dev-nosan //caffe2/test/inductor:group_batch_fusion
```
Test UI: https://www.internalfb.com/intern/testinfra/testrun/18014398535709463
Network: Up: 113KiB           Down: 480KiB           (reSessionID-1d2e3558-15b5-4a4e-8c5d-10c983afb389)
Discovered 9. Pass 0. Fail 0. Fatal 0. Skip 0. Timeout 0
Command: test.                                                                                 Remaining: 9/24. Cache hits: 0%. Time elapsed: 44.3s
Command: test.                                                                                 Remaining: 9/24. Cache hits: 0%. Time elapsed: 44.4s
Command: test.                                                                                 Remaining: 9/24. Cache hits: 0%. Time elapsed: 44.5s
Network: Up: 117KiB  Down: 507KiB  (reSessionID-1d2e3558-15b5-4a4e-8c5d-10c983afb389)
Jobs completed: 24. Time elapsed: 1:48.3s.
Cache hits: 0%. Commands: 2 (cached: 0, remote: 0, local: 2)
Tests finished: Pass 9. Fail 0. Fatal 0. Skip 0. Build failure 0
```
buck2 test mode/dev-nosan //caffe2/test/inductor:split_cat_fx_passes
```
Test UI: https://www.internalfb.com/intern/testinfra/testrun/16044073698893554
Network: Up: 120KiB  Down: 60KiB  (reSessionID-57f2c21b-3f4e-462b-9e5b-fe3dd15f6b7d)
Jobs completed: 28. Time elapsed: 1:47.5s.
Cache hits: 0%. Commands: 2 (cached: 0, remote: 0, local: 2)
Tests finished: Pass 11. Fail 0. Fatal 0. Skip 0. Build failure 0

optimus_scuba_log:
```
{'before_recompile_pre_grad': 'https://www.internalfb.com/intern/everpaste/?color=0&handle=GIbj2haUwKx69H8BAKXdGqXZSpoybr0LAAAz', 'group_batch_fusion_pre_grad': 'https://www.internalfb.com/intern/everpaste/?color=0&handle=GFqhiRYcJ_C4JFoDABKPTsfpzjJ_br0LAAAz', 'normalization_pass_pre_grad': 'https://www.internalfb.com/intern/everpaste/?color=0&handle=GIvswhaiAVyipcoGAJZ5sUi8Bb5qbr0LAAAz', 'remove_split_with_size_one_pass_pre_grad': 'https://www.internalfb.com/intern/everpaste/?color=0&handle=GFneTxcVBPaqVuwCADCiI4q1mEwlbr0LAAAz', 'merge_getitem_cat_pass_pre_grad': 'https://www.internalfb.com/intern/everpaste/?color=0&handle=GJc0Phn87ljuMO0CADBPGqqehKp2br0LAAAz', 'merge_splits_pass_pre_grad': 'https://www.internalfb.com/intern/everpaste/?color=0&handle=GLWB_BbvLyT7D_0DABmygDYPDjJ_br0LAAAz', 'after_recompile_pre_grad': 'https://www.internalfb.com/intern/everpaste/?color=0&handle=GO6eQBeIj6oV3o4JAFLzQ3ECMTIrbr0LAAAz', 'inductor_pre_grad': Counter({'pattern_matcher_nodes': 2006, 'pattern_matcher_count': 1806, 'normalization_pass': 861, 'remove_split_with_size_one_pass': 748, 'merge_splits_pass': 82, 'merge_getitem_cat_pass': 11, 'scmerge_split_sections_removed': 4, 'batch_layernorm': 1, 'batch_sigmoid': 1, 'scmerge_split_added': 1, 'scmerge_cat_added': 1, 'scmerge_split_removed': 1, 'scmerge_cat_removed': 1}), 'before_recompile_post_grad': 'https://www.internalfb.com/intern/everpaste/?color=0&handle=GMoKmxYg6AUeQ40KAMDaJ4EVDwYmbr0LAAAz', 'group_batch_fusion_post_grad': 'https://www.internalfb.com/intern/everpaste/?color=0&handle=GHIvQxkrV1PMBggEACv7786a2bE8br0LAAAz', 'after_recompile_post_grad': 'https://www.internalfb.com/intern/everpaste/?color=0&handle=GIpBNxXupQTHWx8BALSiVrKgDbtfbr0LAAAz', 'inductor_post_grad': Counter({'pattern_matcher_nodes': 2093, 'pattern_matcher_count': 1893, 'normalization_pass': 861, 'remove_split_with_size_one_pass': 748, 'merge_splits_pass': 82, 'merge_getitem_cat_pass': 11, 'scmerge_split_sections_removed': 4, 'batch_layernorm': 1, 'batch_sigmoid': 1, 'scmerge_split_added': 1, 'scmerge_cat_added': 1, 'scmerge_split_removed': 1, 'scmerge_cat_removed': 1, 'batch_aten_mul': 1})}
```

Differential Revision: D55107000

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122361
Approved by: https://github.com/jackiexu1992
2024-03-28 17:13:32 +00:00
049d68d8bb [inductor][Autotune] Add matrix_instr_nonkdim to triton_meta (#122852)
Summary: Previous work `https://github.com/pytorch/pytorch/pull/120742` to enable `matrix_instr_nonkdim` only dealt with the autotuner benchmarking, but failed to enable the parameter in Triton meta for real runs. `matrix_instr_nonkdim` needs to be visible to the compiler driver to set up the optimization pipeline, so it's unlike other kernel parameters such as `BLOCK_N` that can be just set inside the kernel itself.

Test Plan:
P1201466917

  triton_heuristics.template(
    num_stages=1,
    num_warps=4,
    triton_meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32'}, 'device': 0, 'device_type': 'cuda', 'constants': {}, 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=(), ids_of_folded_args=(), divisible_by_8=())], 'matrix_instr_nonkdim': 16},
    inductor_meta={'kernel_name': 'triton_tem_fused_mm_0', 'backend_hash': None},
  )

Perf :
Before: 1.693ms    0.134GB    79.28GB/s
After:    1.577ms    0.134GB    85.12GB/s

Differential Revision: D55456401

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122852
Approved by: https://github.com/xw285cornell
2024-03-28 16:58:38 +00:00
1e8d4b389b Super tiny fix typo (#122881)
"CustoType" -> "CustomType"
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122881
Approved by: https://github.com/awgu
2024-03-28 16:13:25 +00:00
958dbb876c Revert "_foreach_copy with different src/dst dtypes (#121717)"
This reverts commit da2a9a05127c2b44e447e734d99e727d856cb36f.

Reverted https://github.com/pytorch/pytorch/pull/121717 on behalf of https://github.com/janeyx99 due to Causing IMAs on V100s internally :C ([comment](https://github.com/pytorch/pytorch/pull/121717#issuecomment-2025553295))
2024-03-28 15:54:40 +00:00
8698121636 Revert "Add RMSNorm module (#121364)"
This reverts commit a7306de0dc96cda8b698d19680a88d27aa45a31d.

Reverted https://github.com/pytorch/pytorch/pull/121364 on behalf of https://github.com/atalman due to Broke internal tests ([comment](https://github.com/pytorch/pytorch/pull/121364#issuecomment-2025502007))
2024-03-28 15:31:10 +00:00
8007d9a34a Revert "[fx] Preserve Fx graph node order in partitioner across runs (#115621)"
This reverts commit f2c1060de3cdddbfefcab11e547211993d0f9cfa.

Reverted https://github.com/pytorch/pytorch/pull/115621 on behalf of https://github.com/atalman due to Broke internal executorch test ([comment](https://github.com/pytorch/pytorch/pull/115621#issuecomment-2025496296))
2024-03-28 15:28:02 +00:00
9208df45cb Fixed increasing CPU overhead of RemovableHandle.__init__ (#122847)
For some reason, if we construct `class Handle(RemovableHandle)` inside `register_multi_grad_hook`, then over time the call to `RemovableHandle.__init__` slows down more and more (when we have GC disabled). Perhaps this is related to the class attribute `next_id: int = 0`. Python experts: please let me know if you have thoughts 😅

I am open to any suggestions on how we should deal with this `Handle` class. For now, I changed it to a private `_MultiHandle`.

<details>
<summary> Experiment Script </summary>

```
import gc
import time

import torch

NUM_TENSORS = int(5e4)
ts = [torch.empty(1, requires_grad=True) for _ in range(NUM_TENSORS)]

def hook(grad) -> None:
    return

gc.disable()
times = []
for i, t in enumerate(ts):
    start_time = time.time()

    torch.autograd.graph.register_multi_grad_hook([t], hook)

    end_time = time.time()
    times.append(end_time - start_time)

print([f"{t * 1e6:.3f} us" for t in times[1:6]])  # print first few times
print([f"{t * 1e6:.3f} us" for t in times[-5:]])  # print last few times

times = []
for i, t in enumerate(ts):
    start_time = time.time()

    t.register_hook(hook)

    end_time = time.time()
    times.append(end_time - start_time)

print([f"{t * 1e6:.3f} us" for t in times[1:6]])  # print first few times
print([f"{t * 1e6:.3f} us" for t in times[-5:]])  # print last few times
```
</details>

<details>
<summary> Results </summary>

Before fix:
```
['23.603 us', '19.550 us', '15.497 us', '12.875 us', '13.828 us']
['327.110 us', '341.177 us', '329.733 us', '332.832 us', '341.177 us']
['318.050 us', '315.189 us', '319.719 us', '311.613 us', '308.990 us']
['374.317 us', '394.821 us', '350.714 us', '337.362 us', '331.402 us']
```
Calling `register_multi_grad_hook` makes subsequent calls to itself and to `register_hook` slower (actually, any call to `RemovableHandle.__init__`).

After fix:
```
['13.590 us', '9.060 us', '12.875 us', '7.153 us', '8.583 us']
['4.530 us', '5.245 us', '6.437 us', '4.768 us', '5.007 us']
['2.623 us', '1.907 us', '1.431 us', '1.669 us', '1.192 us']
['1.431 us', '1.431 us', '1.192 us', '1.192 us', '1.431 us']
```
</details>

Update: from @soulitzer

> Your suspicion about next_id is right. I think what is happening is that whenever a class attribute is set, it needs to invalidate some cached data for the subclasses one-by-one. eefff682f0/Objects/typeobject.c (L845)
And this PR fixes the issue by avoiding creating many subclasses dynamically. Changing next_id to something like List[int] or incrementing a global instead also fixes this.
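
A minimal sketch (hypothetical names) of the class of fix: read a shared counter instead of writing a class attribute, and avoid defining a new subclass on every call.

```python
import itertools

class HandleSketch:
    _ids = itertools.count()       # shared counter; __init__ never writes a class attribute

    def __init__(self):
        self.id = next(self._ids)  # stays cheap no matter how many subclasses exist

# The slow pattern this PR removes looked roughly like:
#   def register_multi_grad_hook(...):
#       class Handle(RemovableHandle):   # a brand-new subclass on every call
#           ...
# Every such subclass enlarges the set whose attribute caches CPython must invalidate
# whenever the base class's `next_id` attribute is written.
```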

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122847
Approved by: https://github.com/soulitzer
ghstack dependencies: #122726
2024-03-28 15:24:12 +00:00
4290a57e9c Revert "[NJT] .to() properly updates device of offsets (#122797)"
This reverts commit 3e7fd45b409966440c54f5e370885b4b2a388a01.

Reverted https://github.com/pytorch/pytorch/pull/122797 on behalf of https://github.com/jeffdaily due to Sorry for reverting your change but it is failing CUDA and ROCm jobs in trunk. Please help take a look and reland the change ([comment](https://github.com/pytorch/pytorch/pull/122797#issuecomment-2025473181))
2024-03-28 15:17:45 +00:00
cyy
d6aed1b692 Fix clang-tidy warnings of aten/src/ATen/functorch (#122779)
This PR fixes some performance-related clang-tidy warnings in aten/src/ATen/functorch.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122779
Approved by: https://github.com/ezyang
2024-03-28 15:15:06 +00:00
6e1c81c687 Revert "Let dynamo trace some functions in functorch.deprecated.* namespace (#121665)"
This reverts commit f9eab9ca92c603e671e7714669758a81ce8d7111.

Reverted https://github.com/pytorch/pytorch/pull/121665 on behalf of https://github.com/guilhermeleobas due to revert PR ([comment](https://github.com/pytorch/pytorch/pull/121665#issuecomment-2025460500))
2024-03-28 15:11:51 +00:00
f9eab9ca92 Let dynamo trace some functions in functorch.deprecated.* namespace (#121665)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121665
Approved by: https://github.com/zou3519
ghstack dependencies: #121410
2024-03-28 15:07:18 +00:00
f178d996a8 [dynamo] Fix traceback generation on runtime errors (#122746)
Fixes `During handling of the above exception, another exception occurred: [...] torch._dynamo.exc.Unsupported: generator`. traceback.format_exc uses generators, which aren't supported by dynamo yet.
<details>
  <summary>current error message</summary>

```
======================================================================
ERROR: test_custom_fn_saved_tensors (__main__.TestCompiledAutograd)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/home/xmfan/core/pytorch/torch/fx/graph_module.py", line 307, in __call__
    return super(self.cls, obj).__call__(*args, **kwargs)  # type: ignore[misc]
  File "/home/xmfan/core/pytorch/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/xmfan/core/pytorch/torch/nn/modules/module.py", line 1537, in _call_impl
    return forward_call(*args, **kwargs)
  File "<eval_with_key>.0", line 4, in forward
    def forward(self, inputs, sizes, hooks):
IndexError: list index out of range

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/xmfan/core/pytorch/torch/testing/_internal/common_utils.py", line 2741, in wrapper
    method(*args, **kwargs)
  File "/home/xmfan/core/pytorch/test/inductor/test_compiled_autograd.py", line 499, in test_custom_fn_saved_tensors
    self.check_output_and_recompiles(fn, 1)
  File "/home/xmfan/core/pytorch/test/inductor/test_compiled_autograd.py", line 61, in check_output_and_recompiles
    actual = list(opt_fn())
  File "/home/xmfan/core/pytorch/test/inductor/test_compiled_autograd.py", line 495, in fn
    loss.backward()
  File "/home/xmfan/core/pytorch/torch/_tensor.py", line 534, in backward
    torch.autograd.backward(
  File "/home/xmfan/core/pytorch/torch/autograd/__init__.py", line 267, in backward
    _engine_run_backward(
  File "/home/xmfan/core/pytorch/torch/autograd/graph.py", line 766, in _engine_run_backward
    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  File "/home/xmfan/core/pytorch/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/xmfan/core/pytorch/torch/nn/modules/module.py", line 1537, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/xmfan/core/pytorch/torch/_dynamo/eval_frame.py", line 397, in _fn
    res = fn(*args, **kwargs)
  File "/home/xmfan/core/pytorch/torch/fx/graph_module.py", line 741, in call_wrapped
    return self._wrapped_call(self, *args, **kwargs)
  File "/home/xmfan/core/pytorch/torch/fx/graph_module.py", line 315, in __call__
    _WrappedCall._generate_error_message(topmost_framesummary),
  File "/home/xmfan/core/pytorch/torch/fx/graph_module.py", line 289, in _generate_error_message
    tb_repr = get_traceback()
  File "/home/xmfan/core/pytorch/torch/fx/graph_module.py", line 288, in get_traceback
    return traceback.format_exc()
  File "/home/xmfan/.conda/envs/benchmarks/lib/python3.10/traceback.py", line 183, in format_exc
    return "".join(format_exception(*sys.exc_info(), limit=limit, chain=chain))
  File "/home/xmfan/.conda/envs/benchmarks/lib/python3.10/traceback.py", line 136, in format_exception
    return list(te.format(chain=chain))
  File "/home/xmfan/core/pytorch/torch/_dynamo/convert_frame.py", line 941, in catch_errors
    return callback(frame, cache_entry, hooks, frame_state, skip=1)
  File "/home/xmfan/core/pytorch/torch/_dynamo/convert_frame.py", line 348, in _convert_frame_assert
    unimplemented("generator")
  File "/home/xmfan/core/pytorch/torch/_dynamo/exc.py", line 199, in unimplemented
    raise Unsupported(msg)
torch._dynamo.exc.Unsupported: generator
```

</details>

With this change, we get back the descriptive error message:
<details>
  <summary>post-fix error message</summary>

```
Traceback (most recent call last):
  File "/home/xmfan/core/pytorch/torch/fx/graph_module.py", line 307, in __call__
    return super(self.cls, obj).__call__(*args, **kwargs)  # type: ignore[misc]
  File "/home/xmfan/core/pytorch/torch/nn/modules/module.py", line 1527, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/xmfan/core/pytorch/torch/nn/modules/module.py", line 1537, in _call_impl
    return forward_call(*args, **kwargs)
  File "<eval_with_key>.0", line 4, in forward
    def forward(self, inputs, sizes, hooks):
IndexError: list index out of range

Call using an FX-traced Module, line 4 of the traced Module's generated forward function:

def forward(self, inputs, sizes, hooks):

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
    getitem = inputs[0]

    getitem_1 = inputs[1];  inputs = None
```

</details>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122746
Approved by: https://github.com/jansel, https://github.com/anijain2305
ghstack dependencies: #122691
2024-03-28 14:40:54 +00:00
1d96791661 [dynamo] Fix list proxy to list element proxy source propagation (#122691)
Currently, when we create proxies for a list's elements in wrap_fx_proxy_cls, we create them using the same source as the list's, e.g. `LocalSource(inputs)` instead of `GetItemSource(LocalSource(inputs), index=i)`. This results in invalid guards when the tensors the list contains become dynamic, because the guard system thinks the list is a tensor:
```
Malformed guard:
L['sizes'][0] == L['inputs'].size()[0]
Malformed guard:
2 <= L['inputs'].size()[0]

Traceback [...]
AttributeError: 'list' object has no attribute 'size'
```
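
The guard strings below are illustrative (built by hand rather than with Dynamo's Source classes), but they show why per-element sources matter:

```python
def local_source(name: str) -> str:
    return f"L['{name}']"

def get_item_source(base: str, index: int) -> str:
    return f"{base}[{index}]"

inputs = local_source("inputs")

# old: element proxies reused the list's source, so dynamic-shape guards land on the list
bad_guard = f"2 <= {inputs}.size()[0]"                       # a Python list has no .size()
# new: element proxies carry GetItemSource, so guards land on the tensor element
good_guard = f"2 <= {get_item_source(inputs, 0)}.size()[0]"  # 2 <= L['inputs'][0].size()[0]

print(bad_guard)
print(good_guard)
```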

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122691
Approved by: https://github.com/jansel, https://github.com/anijain2305
2024-03-28 14:40:54 +00:00
0284bca99b Don't cache device_count if we haven't initialized CUDA yet (#122815)
Before initializing CUDA, it can change by modifying CUDA_VISIBLE_DEVICES
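
A minimal sketch (assumes a machine with more than one GPU) of the behavior this enables:

```python
import os
import torch

print(torch.cuda.device_count())          # e.g. 8; this call previously cached the value

os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # still allowed: CUDA has not been initialized

assert not torch.cuda.is_initialized()
print(torch.cuda.device_count())          # now 1; before this PR the stale 8 was returned
```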

Fixes https://github.com/pytorch/pytorch/issues/122085
Fixes https://github.com/pytorch/pytorch/issues/38616
Fixes https://github.com/pytorch/pytorch/issues/110000
Fixes https://github.com/pytorch/pytorch/issues/110971
Fixes https://github.com/pytorch/pytorch/issues/95073

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122815
Approved by: https://github.com/albanD
2024-03-28 13:23:45 +00:00
84dc76156a Workaround dind-rootless volumes mount as root (#122787)
In ARC Runners we are using dind-rootless to run docker-in-docker and
in rootless mode volume mounts always mount as root but are mapped to
the local `runner` user in ARC. This causes the build.sh and test.sh
scripts to fail because they run as the `jenkins` user and expect to
be able to write to the workspace path that's being mounted.

Signed-off-by: Thanh Ha <thanh.ha@linuxfoundation.org>
2024-03-28 09:06:40 -04:00
cyy
d1da9cc654 [ClangTidy] Disable misc-include-cleaner (#122855)
misc-include-cleaner was introduced in clang-tidy-17 as a way to check for missing and unused includes. However, there are lots of transitive headers in PyTorch, and it would take enormous effort to add the related annotations needed to direct this checker. For this reason, it's better to disable it for now.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122855
Approved by: https://github.com/cpuhrsch
2024-03-28 10:10:43 +00:00
8c8e4e31f2 Some improvements to nonzero post guard_size_oblivious (#122156)
Prompted by https://github.com/pytorch/pytorch/pull/121571

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122156
Approved by: https://github.com/jansel
2024-03-28 03:53:16 +00:00
caa57e4fcd Add tensor step and capturable support to rmsprop (#122264)
Towards fixing https://github.com/pytorch/pytorch/issues/115679
Fixes RMSprop step update while compiling

Adds capturable support to RMSprop

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122264
Approved by: https://github.com/janeyx99
2024-03-28 03:39:28 +00:00
927bc4b558 [vision hash update] update the pinned vision hash (#122754)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122754
Approved by: https://github.com/pytorchbot
2024-03-28 03:27:07 +00:00
c10352a406 [audio hash update] update the pinned audio hash (#122584)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122584
Approved by: https://github.com/pytorchbot
2024-03-28 03:26:21 +00:00
235f24fc66 [inductor] Add FileLock around V.debug.copy (#122665)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122665
Approved by: https://github.com/ezyang
2024-03-28 03:17:33 +00:00
1b5ccdb0f0 Avoid COW materialize in more forward ops (#122720)
Affected ops:
* ormqr
* lerp
* multinomial
* bernoulli
* histogram
* searchsorted
* log_softmax
* jiterator ops
* dropout
* _segment_reduce

Part of #97856

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122720
Approved by: https://github.com/ezyang
2024-03-28 03:02:13 +00:00
60f3c092d4 [dynamo] Config option to Inline builtin nn module forward (#122725)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122725
Approved by: https://github.com/jansel
ghstack dependencies: #122646, #122647, #122716, #122769, #122818
2024-03-28 03:01:27 +00:00
d4317becce [dynamo][easy] Force recompilation in a test (#122818)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122818
Approved by: https://github.com/williamwen42
ghstack dependencies: #122646, #122647, #122716, #122769
2024-03-28 03:01:27 +00:00
52b1d2a73d Increase timm batch sizes to make less overhead-bound and less noisy (#122581)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122581
Approved by: https://github.com/ezyang
ghstack dependencies: #122686, #122688, #121692, #122841
2024-03-28 02:34:32 +00:00
e6ee8322d7 nn.Module: use swap_tensors for Tensor subclasses (#122755)
This fixes a bug when casting a module that has DTensor parameters. The old behavior would swap the .data field of the Tensor subclass, which is incorrect when dealing with tensor subclasses that may have multiple child tensors.

This uses the `swap_tensors` method to swap all of the tensors, not just the .data field.

Test plan:

```
pytest test/distributed/_tensor/test_api.py -k 'test_distribute_module_casting'
python test/distributed/fsdp/test_wrap.py -k test_auto_wrap_smoke_test_cuda_init_mode1_cpu_offload0_use_device_id_True
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122755
Approved by: https://github.com/wanchaol, https://github.com/mikaylagawarecki
2024-03-28 02:03:09 +00:00
3e7fd45b40 [NJT] .to() properly updates device of offsets (#122797)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122797
Approved by: https://github.com/jbschlosser
2024-03-28 00:56:23 +00:00
574a8ccf10 Remove several expectedFailureNonStrict (#122802)
This PR removes several `expectedFailureNonStrict` from `test_export.py`, where the error messages from strict and non-strict export differ a bit.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122802
Approved by: https://github.com/ydwu4
2024-03-28 00:42:49 +00:00
12116aee68 Add Flash Attention support on ROCM (#121561)
This patch addresses the major limitations in our previous [PR #115981](https://github.com/pytorch/pytorch/pull/115981) through the new dedicated repository [AOTriton](https://github.com/ROCm/aotriton)

- [x] Only supports MI200 series GPU (i.e., `gcnArchName == gfx90a:sramecc+:xnack-`).
    * MI300X is supported. More architectures will be added once Triton supports them.
- [x] Only supports power of two sequence lengths.
    * Now it supports arbitrary sequence lengths.
- [ ] No support for varlen APIs.
    * The varlen API will be supported in a future release of AOTriton.
- [x] Only supports head dimensions 16, 32, 64, 128.
    * Now it supports arbitrary head dimensions <= 256.
- [x] Performance is still being optimized.
    * Kernel is selected according to autotune information from Triton.

Other improvements from AOTriton include
* Allow more flexible Tensor storage layout
* More flexible API

This is a more extensive fix to #112997
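
A minimal usage sketch (assumes a ROCm build on a supported GPU such as MI200/MI300X) that forces the flash-attention backend:

```python
import torch
import torch.nn.functional as F

# head dim 64 <= 256 and a non-power-of-two sequence length are both fine now
q, k, v = (torch.randn(2, 8, 1000, 64, device="cuda", dtype=torch.float16) for _ in range(3))

with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

print(out.shape)  # torch.Size([2, 8, 1000, 64])
```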

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121561
Approved by: https://github.com/huydhn
2024-03-28 00:27:38 +00:00
8d676a6e8e [dynamo][cpp-guards] Bugfix for size/strides for tensor match (#122828)
This got missed because CPP guard manager is not ON by default.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122828
Approved by: https://github.com/mlazos, https://github.com/jansel
2024-03-28 00:16:49 +00:00
66510c641f [c10d][NCCL] Refactor coalesced storage (#122651)
The `coalescedDevice_` and `coalescedComms_` are used inefficiently and, in the case of consecutive coalescing comms, can cause a read-before-write condition.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122651
Approved by: https://github.com/kwen2501, https://github.com/eqy
2024-03-27 23:56:02 +00:00
cc12668053 Fix swap_tensors path in _apply for modules that inherit from RNNBase (RNN, GRU, LSTM) (#122800)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122800
Approved by: https://github.com/albanD
2024-03-27 23:34:16 +00:00
0348773655 Forward fix for subtly breaking AC with compile in the case of stacked (#122841)
checkpoint layers separated by recomputable op
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122841
Approved by: https://github.com/anijain2305
ghstack dependencies: #122686, #122688, #121692
2024-03-27 23:23:04 +00:00
a8b7480f0d fix dynamo.explain examples (#122745)
`dynamo.explain()` was updated to return a structure but the docs weren't updated to match.
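
A hedged example of the structured API the docs now describe; the attribute names are assumed from the returned `ExplainOutput` structure and may differ slightly between releases:

```
import torch

def fn(x):
    return x.sin() + x.cos()

explanation = torch._dynamo.explain(fn)(torch.randn(8))
print(explanation.graph_count)        # how many graphs were captured
print(explanation.graph_break_count)  # how many graph breaks occurred
print(explanation.break_reasons)      # why each break happened
```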

- Update the docs to use the new API
- Remove some dead code left when `explain` was updated.
- Drive-by: Fix some `nopython` uses that I noticed
- Drive-by: I noticed an ignored error coming from CleanupHook on shutdown - make it check the global before setting it.

Fixes #122573

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122745
Approved by: https://github.com/jansel
2024-03-27 22:53:27 +00:00
a54ea7bbd8 Made several changes to min-cut partitioner that allow it to recompute more things (#121692)
Perf results
<img width="862" alt="image" src="https://github.com/pytorch/pytorch/assets/6355099/8d44e633-8941-46a6-8e7d-806330a8c890">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121692
Approved by: https://github.com/shunting314, https://github.com/eellison
ghstack dependencies: #122686, #122688
2024-03-27 22:45:52 +00:00
bef01c7c2b Revert "Optimize multi_tensor_apply (take 2) (#119764)"
This reverts commit fe41ba47652ca73569453bddb43605c77bb85184.

Reverted https://github.com/pytorch/pytorch/pull/119764 on behalf of https://github.com/atalman due to Failing internally ([comment](https://github.com/pytorch/pytorch/pull/119764#issuecomment-2024105399))
2024-03-27 22:42:07 +00:00
222dfc4282 [Inductor] Run pattern matcher over the original graph (#122519)
Differential Revision: [D55429070](https://our.internmc.facebook.com/intern/diff/D55429070)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122519
Approved by: https://github.com/jansel
2024-03-27 22:09:36 +00:00
530e13cf3d Revert "[c10d] disable compute_duration by default (#122138)" (#122539)
This reverts commit bf18e967b4abc90c27ad460680497d8f5ec55962.

It is stacked after a fix to elapsed_time that will resolve the memory issues that required the introduction of this flag.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122539
Approved by: https://github.com/wconstab, https://github.com/shuqiangzhang
ghstack dependencies: #122538
2024-03-27 21:53:28 +00:00
933d3a7829 Allow dynamo to inline through "hessian" (#121410)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121410
Approved by: https://github.com/zou3519
2024-03-27 21:39:37 +00:00
a7306de0dc Add RMSNorm module (#121364)
Similar to dbeed9724b/torchmultimodal/modules/layers/normalizations.py (L51)

**The implementation here is not optimized and we welcome pull requests to improve this**

- Use `normalized_shape` instead of singular integer `dim` to be aligned with the `nn.LayerNorm` implementation
- Remove the [upcast to float and downcast](dbeed9724b/torchmultimodal/modules/layers/normalizations.py (L73))
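
A hedged usage sketch of the new module, assuming the `torch.nn.RMSNorm` API added here with `normalized_shape` semantics matching `nn.LayerNorm`:

```
import torch
import torch.nn as nn

rms_norm = nn.RMSNorm(normalized_shape=[64])  # normalize over the last dimension of size 64
x = torch.randn(8, 16, 64)
y = rms_norm(x)  # output has the same shape as x
print(y.shape)   # torch.Size([8, 16, 64])
```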

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121364
Approved by: https://github.com/albanD
2024-03-27 21:39:30 +00:00
b693fff5d7 Add non strict inline constraints and runtime assertions to non-strict exported program (#122722)
This PR reduces the difference between strict and non-strict exported program by

- Support `inline_constraints` for non-strict exported program
- Add runtime assertions for range constraints to non-strict exported program

After this PR, the following unit tests are no longer `expectedFailureNonStrict`:
- test_automatic_constrain_size
- test_export_with_inline_constraints
- test_redundant_asserts
- test_constrain_size_with_constrain_value
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122722
Approved by: https://github.com/pianpwk
2024-03-27 21:20:03 +00:00
abe4a0e9eb [dynamo] pop result of print reordering (#122744)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122744
Approved by: https://github.com/jansel
ghstack dependencies: #122146, #122335, #122354, #122355, #122356, #122449, #122455, #122456, #122530, #122737, #122738, #122739, #122740, #122741, #122742, #122743
2024-03-27 20:39:39 +00:00
76fe0faadd [dynamo, 3.12] add END_SEND (#122743)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122743
Approved by: https://github.com/jansel
ghstack dependencies: #122146, #122335, #122354, #122355, #122356, #122449, #122455, #122456, #122530, #122737, #122738, #122739, #122740, #122741, #122742
2024-03-27 20:39:39 +00:00
c5d372dafc [dynamo, 3.12] trace through __mro__ attribute access (#122742)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122742
Approved by: https://github.com/jansel
ghstack dependencies: #122146, #122335, #122354, #122355, #122356, #122449, #122455, #122456, #122530, #122737, #122738, #122739, #122740, #122741
2024-03-27 20:39:39 +00:00
71d40ff861 [dynamo, 3.12] fix typing variable tracing (#122741)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122741
Approved by: https://github.com/jansel
ghstack dependencies: #122146, #122335, #122354, #122355, #122356, #122449, #122455, #122456, #122530, #122737, #122738, #122739, #122740
2024-03-27 20:39:39 +00:00
5d0a792d5f [dynamo, 3.12] fix some tests (#122740)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122740
Approved by: https://github.com/jansel
ghstack dependencies: #122146, #122335, #122354, #122355, #122356, #122449, #122455, #122456, #122530, #122737, #122738, #122739
2024-03-27 20:39:39 +00:00
a9704848d1 [dynamo, 3.12] add CALL_INTRINSIC_1 (#122739)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122739
Approved by: https://github.com/jansel
ghstack dependencies: #122146, #122335, #122354, #122355, #122356, #122449, #122455, #122456, #122530, #122737, #122738
2024-03-27 20:39:39 +00:00
8e5a4248a3 [dynamo, 3.12] add LOAD_SUPER_ATTR (#122738)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122738
Approved by: https://github.com/jansel
ghstack dependencies: #122146, #122335, #122354, #122355, #122356, #122449, #122455, #122456, #122530, #122737
2024-03-27 20:39:39 +00:00
8cd7bb7422 [dynamo, 3.12] add LOAD_FAST variants (#122737)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122737
Approved by: https://github.com/jansel
ghstack dependencies: #122146, #122335, #122354, #122355, #122356, #122449, #122455, #122456, #122530
2024-03-27 20:39:39 +00:00
a9b27bbbe9 [dynamo, 3.12] update jump instructions (#122530)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122530
Approved by: https://github.com/jansel
ghstack dependencies: #122146, #122335, #122354, #122355, #122356, #122449, #122455, #122456
2024-03-27 20:39:39 +00:00
f44f16ebd5 [dynamo, 3.12] add END_FOR (#122456)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122456
Approved by: https://github.com/jansel
ghstack dependencies: #122146, #122335, #122354, #122355, #122356, #122449, #122455
2024-03-27 20:39:39 +00:00
bcdd0c6f59 [dynamo, 3.12] add BINARY/STORE_SLICE (#122455)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122455
Approved by: https://github.com/jansel
ghstack dependencies: #122146, #122335, #122354, #122355, #122356, #122449
2024-03-27 20:39:39 +00:00
7b13228038 [dynamo, 3.12] fix DICT_VERSION C++ guards (#122449)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122449
Approved by: https://github.com/jansel
ghstack dependencies: #122146, #122335, #122354, #122355, #122356
2024-03-27 20:39:39 +00:00
01547960bc [dynamo, 3.12] remove LOAD_METHOD, update LOAD_ATTR (#122356)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122356
Approved by: https://github.com/jansel
ghstack dependencies: #122146, #122335, #122354, #122355
2024-03-27 20:39:39 +00:00
8ba26f4aa5 [dynamo, 3.12] support RETURN_CONST (#122355)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122355
Approved by: https://github.com/jansel
ghstack dependencies: #122146, #122335, #122354
2024-03-27 20:39:39 +00:00
3a67c86f72 [dynamo, 3.12] remove references to PRECALL instruction in 3.12 (#122354)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122354
Approved by: https://github.com/jansel
ghstack dependencies: #122146, #122335
2024-03-27 20:39:39 +00:00
35382f0573 [dynamo, 3.12] Use CPython internal _PyOpcode_Caches instead of hardcoding (#122335)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122335
Approved by: https://github.com/jansel
ghstack dependencies: #122146
2024-03-27 20:39:39 +00:00
2564f6cf0e [dynamo, 3.12] Allocate Dynamo shadow frames by mimicking CPython (#122146)
Python 3.12 changed a few things with how `_PyInterpreterFrame`s are allocated and freed:
- Frames are now required to be placed on the Python frame stack. In 3.11, we could allocate frames anywhere in memory. In 3.12, we now need to use `THP_PyThreadState_BumpFramePointerSlow`/`push_chunk`/`allocate_chunk`. This method of allocating/freeing frames is also compatible with 3.11.
- The eval frame function is now responsible for clearing the frame (see https://docs.python.org/3/whatsnew/changelog.html#id128, the point about "...which now clear the frame.")

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122146
Approved by: https://github.com/jansel
2024-03-27 20:39:39 +00:00
b73c603771 Update base for Update on "[NJT] Actually inline NT torch function during dynamo"
In order to avoid having any temporary state where the behavior of anything is regressed. This PR does all of the following at once:

(1) Disables torch function running a second time in AOTAutograd

If you have a tensor subclass that relies on dispatch into the same op without unwrapping and calling torch._C.DisableTorchFunctionSubclass() the torch function-ness will survive into AOTAutograd (when normally we may expect the torch function to be inlined away during dynamo). If this happens, we should make sure to not run the torch function logic a second time.

2.  Enables torch function to be inlined in dynamo for NT

Due to torch function running a second time AOTAutograd, NT was actually relying on this behavior instead of properly inlining through torch function at the dynamo level. 

3. Fixes graph breaks for NT torch function

Now that we are inlining through torch function for the first time in dynamo, we've uncovered some graph breaks. Thanks to mlazos, we should have support for custom attributes for torch function now. We also add support for a custom Enum type. Finally, a few of them we can get rid of by adding allow_in_graph (though we may need to double check the soundness here).


Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-27 13:01:50 -07:00
ccfc87b199 include scheduler_on_plateau in optim.h (#121722)
Fixes #121593
Co-authored-by: Jane Xu <janeyx@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121722
Approved by: https://github.com/albanD
2024-03-27 19:45:25 +00:00
ceff2205e9 [dynamo][cpp-guards] Bugfix to pass on correct example_value (#122769)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122769
Approved by: https://github.com/jansel
ghstack dependencies: #122646, #122647, #122716
2024-03-27 19:40:46 +00:00
7281c5afdc [dynamo][fbcode][torchrec] Selectively inline torchrec/distributed/types.py (#122716)
Manually verified for the internal model.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122716
Approved by: https://github.com/jansel
ghstack dependencies: #122646, #122647
2024-03-27 19:40:46 +00:00
5b42c41b19 [dynamo][improve-guard-overhead] Skip TENSOR_MATCH guards on parameters for optimizers (#122647)
**1.32x guard overhead reduction** (1.092 vs 0.827 ms) for MegatronBertForCausalLM with 394 params.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122647
Approved by: https://github.com/jansel, https://github.com/mlazos
ghstack dependencies: #122646
2024-03-27 19:40:43 +00:00
c108696228 [dynamo][guards-cpp-refactor][easy] Env variable to turn on cpp manager (#122646)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122646
Approved by: https://github.com/jansel
2024-03-27 19:40:37 +00:00
1b9c7e41bb Remove .data call in LSTM as it is not necessary (#122733)
Summary: Title

Test Plan: CI

Differential Revision: D55392057

Functional pre-dispatch tracing chokes on the LSTM .data call today. While we need to fix that, this call seems unnecessary here.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122733
Approved by: https://github.com/mikaylagawarecki, https://github.com/albanD
2024-03-27 19:08:22 +00:00
1d6fc0d4de Fixed _infer_device_type warning in checkpoint (#122726)
Previously, we were checking `len(device_types)` where `device_types` is a `list`. This meant that if there were multiple inputs, we would see something like `device_types = ["cuda", "cuda"]` and a false positive warning. We should check `len(set(device_types))`.
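
A minimal illustration of the check being fixed (toy values, not the checkpoint code itself):

```
device_types = ["cuda", "cuda"]    # multiple inputs, all on the same device type
print(len(device_types) > 1)       # True  -> the old check gives a false-positive warning
print(len(set(device_types)) > 1)  # False -> the new check warns only for genuinely mixed devices
```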
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122726
Approved by: https://github.com/soulitzer
2024-03-27 18:38:42 +00:00
37e3c8f33f [DCP] Supporting resolve_bytes in LoadPlanner (#122700)
1. Supporting resolve bytes, similar to resolve_tensor.
2. This will allow us to load the bytes, directly on to the user provided ioBytes buffer.

This essentially mirrors the existing pattern we have for tensors, where the user is expected to follow some version of:

```
1. resolve_tensor
2. copy to target tensor
3. commit_tensor
```

Differential Revision: [D55259699](https://our.internmc.facebook.com/intern/diff/D55259699/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122700
Approved by: https://github.com/Skylion007, https://github.com/wz337, https://github.com/pradeepfn
2024-03-27 17:43:32 +00:00
cd51496f8b add a couple debug options (#121033)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121033
Approved by: https://github.com/ezyang
2024-03-27 17:24:43 +00:00
5af839f86d [quant][pt2e] Enable observer sharing between different quantization specs (#122734)
Summary:

Right now we don't insert additional observers (i.e., we share observers) if qspec.dtype and qspec.is_dynamic match exactly. Since fixed qparams quantization spec and derived quantization spec do not have an is_dynamic field currently, observer sharing does not happen between them and quantization spec. In this PR we fix the issue by adding is_dynamic to all quantization specs.

Note: SharedQuantizationSpec should probably be its own type in the future
TODO later:
(1). move all these fields (dtype, is_dynamic, quant_min, quant_max etc.) to QuantizationSpecBase,
(2). make SharedQuantizationSpec a separate type
(3). add quant_min/quant_max in observer sharing checking in pt2e/prepare.py

Test Plan:
python test/test_quantization.py -k test_fixed_qparams_qspec_observer_dedup

Differential Revision: [D55396546](https://our.internmc.facebook.com/intern/diff/D55396546)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122734
Approved by: https://github.com/andrewor14
2024-03-27 16:45:19 +00:00
af7ac3e5c4 Update base for Update on "[NJT] Actually inline NT torch function during dynamo"
In order to avoid having any temporary state where the behavior of anything is regressed. This PR does all of the following at once:

(1) Disables torch function running a second time in AOTAutograd

If you have a tensor subclass that relies on dispatch into the same op without unwrapping and calling torch._C.DisableTorchFunctionSubclass() the torch function-ness will survive into AOTAutograd (when normally we may expect the torch function to be inlined away during dynamo). If this happens, we should make sure to not run the torch function logic a second time.

2.  Enables torch function to be inlined in dynamo for NT

Due to torch function running a second time AOTAutograd, NT was actually relying on this behavior instead of properly inlining through torch function at the dynamo level. 

3. Fixes graph breaks for NT torch function

Now that we are inlining through torch function for the first time in dynamo, we've uncovered some graph breaks. Thanks to mlazos, we should have support for custom attributes for torch function now. We also add support for a custom Enum type. Finally, a few of them we can get rid of by adding allow_in_graph (though we may need to double check the soundness here).


Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-27 09:22:15 -07:00
b63f6f78dc Revert "[Inductor] Run pattern matcher over the original graph (#122519)"
This reverts commit 1f5fcb4e203eb343e8c53f6444015c98e8f68d60.

Reverted https://github.com/pytorch/pytorch/pull/122519 on behalf of https://github.com/atalman due to Breaks internal tests ([comment](https://github.com/pytorch/pytorch/pull/122519#issuecomment-2023022311))
2024-03-27 15:13:26 +00:00
f3b82a4dc2 [xla hash update] update the pinned xla hash (#122628)
Originally made this PR since xla was failing, but the PR that changed the pin got reverted, so this is just a normal update now

The old pin was ~2 weeks old?

Currently XLA is broken https://github.com/pytorch/pytorch/actions/runs/8438508272/job/23115239444
Co-authored-by: Andrey Talman <atalman@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122628
Approved by: https://github.com/malfet, https://github.com/JackCaoG
2024-03-27 15:09:42 +00:00
f140309e9c Revert "Only update momentum buffers for SGD if momentum is enabled (#122349)"
This reverts commit a333b080c16a3a6bbb057b4fbaaec4a4e14615dd.

Reverted https://github.com/pytorch/pytorch/pull/122349 on behalf of https://github.com/atalman due to Broke internal tests ([comment](https://github.com/pytorch/pytorch/pull/122349#issuecomment-2023001467))
2024-03-27 15:04:52 +00:00
70c3deef2d Revert "[xla hash update] update the pinned xla hash (#122628)"
This reverts commit 04399a30913fd04c2120420b671cd432659d56e6.

Reverted https://github.com/pytorch/pytorch/pull/122628 on behalf of https://github.com/atalman due to Need revert and then reland ([comment](https://github.com/pytorch/pytorch/pull/122628#issuecomment-2022995857))
2024-03-27 15:01:33 +00:00
eb5381da66 Skip storage check debug assert in view codegen when output is a subclass instance (#122718)
Before the fix, this assert blows up in DEBUG mode for views where the input (base) is a dense tensor and the output (view) is a subclass instance.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122718
Approved by: https://github.com/soulitzer
2024-03-27 14:39:51 +00:00
105381ea11 [inductor][cpp] simplify CppVecKernelChecker (remove bool/int8 load as mask and load as float flags) (#119734)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/119734
Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel
ghstack dependencies: #119654, #119655
2024-03-27 11:20:35 +00:00
49121603ab [inductor][cpp] support vectorized indirect indexing (#119655)
This PR adds vectorized indirect indexing so that we can further simplify the `CppVecKernelChecker` (done in the later PR #119734) and remove the check that throws `CppVecUnsupportedError`. A boundary assertion check is added on vectorized indices via the new `indirect_assert` method on `Kernel`; the base implementation handles scalar indices and is overridden in `CppVecKernel` for vectorized indices.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/119655
Approved by: https://github.com/jansel
ghstack dependencies: #119654
2024-03-27 10:25:45 +00:00
a697d972b1 Fix torchbench errors (#122735)
Summary: It looks like this target has stopped working; let's fix it.

Test Plan:
```
buck2 run mode/opt //caffe2/benchmarks/dynamo/:test
```
now works

Differential Revision: D55389546

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122735
Approved by: https://github.com/xmfan
2024-03-27 06:59:16 +00:00
367ec62ae3 [inductor][cpp] generalize vector mask for dtypes (#119654)
Vectorized boolean values in CPU Inductor were modeled with `Vectorized<float>`, which cannot work for operations with other data types. This PR generalizes it with the new `VecMask` template class that can work for masks on any vectorized data type. The intrinsics implementations in `cpp_prefix.h` for mask conversion, cast, and masked load are now implemented as specializations of `VecMask` and moved to the corresponding header files.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/119654
Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel
2024-03-27 05:33:53 +00:00
f2c1060de3 [fx] Preserve Fx graph node order in partitioner across runs (#115621)
Fixes #ISSUE_NUMBER
The partitioner generates a different graph on each recompilation run.
Co-authored-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/115621
Approved by: https://github.com/ezyang
2024-03-27 02:20:37 +00:00
d1104d76aa [Easy] Fix freezing bug with mismatched bias sizes (#122724)
Fix for https://github.com/pytorch/pytorch/issues/121231

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122724
Approved by: https://github.com/davidberard98
2024-03-27 01:41:00 +00:00
249e65b92d Graph-Safe RNG State Exchange for Tensor Parallelism (#114068)
See #113541

The PR allows for registering and controlling multiple RNG states using indices, ensuring cudagraph-safe operations, and includes both C++ and Python API changes to support this functionality.

cc  @eellison @anijain2305 @jansel @ezyang @ptrblck @csarofeen @mcarilli
Pull Request resolved: https://github.com/pytorch/pytorch/pull/114068
Approved by: https://github.com/ezyang, https://github.com/eqy, https://github.com/xuzhao9
2024-03-27 01:14:38 +00:00
fe41ba4765 Optimize multi_tensor_apply (take 2) (#119764)
### Take 2

The first take (#119153) landed but was reverted because it broke cuda graph for `multi_tensor_apply`. This PR is a reland of #119153:
- Incorporate #119652 so that the optimization can be applied (1) without increasing binary size (2) to all 3 MTA variants without much code duplication.
- Ensure the optimization is compatible with cuda graph.

### Summary

Due to the dynamic nature of the workload, the kernel arguments aren't guaranteed to fit in the static 4kb kernel argument memory. Previously with the apex implementation, we overcame this limitation by dividing a multi_tensor_apply workload into multiple kernel launches. However, this led to low sustained occupancy, affecting the performance of memory bound ops.

Based on the observation that the kernel argument memory limitation doesn't correlate well with available SM resources, we adopt a different approach:
- When the kernel arguments fit into the static kernel argument memory, we use this memory to transfer the arguments.
- Conversely, when the kernel arguments don't fit into the static kernel argument memory, instead of sacrificing sustained occupancy, we use a page-locked cudaMemcpyAsync to transfer the arguments, then perform the entire workload in a single kernel.
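
A hedged, purely illustrative sketch of the dispatch rule above (the real implementation lives in the CUDA/C++ multi_tensor_apply code; the constant and function names here are invented):

```
STATIC_KERNEL_ARG_LIMIT = 4096  # bytes of static kernel-argument space

def choose_arg_transfer(packed_arg_bytes: int) -> str:
    if packed_arg_bytes <= STATIC_KERNEL_ARG_LIMIT:
        # Fits: pass the packed tensor-list metadata directly as kernel arguments.
        return "static kernel arguments"
    # Doesn't fit: stage the metadata in page-locked memory, cudaMemcpyAsync it,
    # and still run the whole workload in a single kernel launch.
    return "pinned-memory async copy"

print(choose_arg_transfer(1024))       # small tensor list
print(choose_arg_transfer(64 * 1024))  # large tensor list
```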

This PR only covers `multi_tensor_apply` for tensors. The change can be easily applied to `multi_tensor_apply` for tensors + scalars and `multi_tensor_apply_for_fused_optimizer`.

### Benchmark (WIP)

The only benchmark I've conducted so far is on `_foreach_copy_` with a set of sizes that resembles an internal workload. I need to benchmark more problem sizes. The speedup should vary among problem sizes. **However, I believe this PR should not be slower than the previous impl on any problem sizes.**

The benchmark can be reproduced with [this script](https://gist.github.com/yifuwang/178c1f4bf951c5794ea79c04d90e44fa).

**Baseline**

A single iteration in trace:
<img width="831" alt="image" src="https://github.com/pytorch/pytorch/assets/4156752/5c8d72d0-0628-4989-88a8-c756f6bc1319">

```
https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html#!/?url=https://interncache-all.fbcdn.net/manifold/perfetto_internal_traces/tree/shared_trace/yifu_5a59145f-567b-472f-8eef-c61c388d45b4.json
device ms: 1.111, cpu ms: 7.151
memory bandwidth: 1169.825 GB/s
```

**This PR**

A single iteration in trace:
<img width="967" alt="image" src="https://github.com/pytorch/pytorch/assets/4156752/a023e183-8166-48f7-b7c0-c8ba32653d2b">

```
https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html#!/?url=https://interncache-all.fbcdn.net/manifold/perfetto_internal_traces/tree/shared_trace/yifu_da060725-62a8-466e-b570-2ad67ff0e29d.json
device ms: 0.892, cpu ms: 0.810
memory bandwidth: 1456.744 GB/s
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/119764
Approved by: https://github.com/eqy, https://github.com/eellison, https://github.com/crcrpar
2024-03-27 00:51:30 +00:00
67a4d6d6cb Stopped TORCH_COMPILE_DEBUG from printing out a bunch of logs (#122688)
@ezyang suggests using TORCH_TRACE for dumping out all intermediate logs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122688
Approved by: https://github.com/ezyang, https://github.com/mlazos
ghstack dependencies: #122686
2024-03-27 00:24:40 +00:00
602c2af9e3 Cleaned up/fixed get_args after_aot repro (#122686)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122686
Approved by: https://github.com/ezyang
2024-03-27 00:24:40 +00:00
c81c9ba472 Disallow {FakeTensor,FunctionalTensor}.data_ptr (#122514)
This PR:
- disallows FakeTensor.data_ptr when it is called inside PT2 or fx tracing.
- disallows FunctionalTensor.data_ptr (python FunctionalTensor is only used in
  PT2)

The motivation behind this is that the leading cause of segfaults when
using custom ops with PT2 is calling .data_ptr on FunctionalTensor or
FakeTensor.

This change is BC-breaking. If your code broke as a result of this, it's
because there was a bug in it (these .data_ptr should never be
accessed!). You can either fix the bug (recommended) or get the previous
behavior back with:
```
from torch._subclasses.fake_tensor import FakeTensor
from torch._subclasses.functional_tensor import FunctionalTensor

data_ptr = 0 if isinstance(tensor, (FakeTensor, FunctionalTensor)) else tensor.data_ptr()
```

Test Plan:
- existing tests

Differential Revision: [D55366199](https://our.internmc.facebook.com/intern/diff/D55366199)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122514
Approved by: https://github.com/ezyang, https://github.com/albanD, https://github.com/yifuwang, https://github.com/kurtamohler
2024-03-26 23:55:42 +00:00
04399a3091 [xla hash update] update the pinned xla hash (#122628)
Originally made this PR since xla was failing, but the PR that changed the pin got reverted, so this is just a normal update now

The old pin was ~2 weeks old?

Currently XLA is broken https://github.com/pytorch/pytorch/actions/runs/8438508272/job/23115239444
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122628
Approved by: https://github.com/malfet, https://github.com/JackCaoG
2024-03-26 23:51:38 +00:00
07b618e2d4 Graph break cleanly in Dynamo for module parametrization (#121041)
Fixes #118795

This is a graph breaking partial fix for #120914. We still need -actual- module parametrization tracing support, but at least it doesn't blow up hard now.

**Background**: Module parametrization injects a property as the module parameter attribute that calls a `nn.Module` whose forward takes in a module parameter and returns a reparametrized module parameter.
Example:
```
class MyParametrization(nn.Module):
    def forward(X):
        # This reparametrization just negates the original parameter value
        return -X

m = nn.Linear(...)
p = MyParametrization()
register_parametrization(m, "weight", p)

# Accessing the "weight" attribute will invoke p's forward() on m's original weight and return the output as the new weight.
# m.weight here is now an injected property that does the above instead of an actual Parameter.
# This property is defined in torch/nn/utils/parametrize.py.
m.weight

# NB: Parametrization changes the module type (e.g. torch.nn.utils.parametrize.ParametrizedLinear)
print(type(m))
```

**Problem 1**: Dynamo has special tracing rules for things in `torch.nn`. Parametrizing a module changes the type of the module and the parametrized attribute, so now these rules wrongly affect tracing here. To fix this:
* For parametrized modules, call `convert_to_unspecialized()` to restart analysis where Dynamo starts inlining the module.

**Problem 2**: The issue seen in #118795 is that Dynamo will see a dynamically constructed tensor when `m.weight` is called and introduce that to its `tensor_weakref_to_sizes_strides` cache during fake-ification. This tensor is also made to be a graph input, since it's a module parameter. When guards are created for this module parameter input, the logic calls `m.weight` again and tries to look the result up in the cache, but this is a different tensor now, giving the `KeyError` symptom. To fix this:
* Replace Dynamo's `tensor_weakref_to_sizes_strides` cache with a `input_source_to_sizes_strides` cache.
    * This cache was originally introduced in #100128.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121041
Approved by: https://github.com/anijain2305
2024-03-26 23:44:51 +00:00
2367d0dacd [AOTInductor] Add tensor_constantX to pass constant buffer update's check (#122562) (#122690)
Summary:

During tracing, some constants (tensor_constant{idx}) are generated internally.
Those constants are neither parameters nor buffers, and users have no control over them.

To accommodate this, we should allow users to skip passing in those internally generated constants while still being able to update the constants in the model.

Test Plan:
Included in commit.
```
build/bin/test_aot_inductor
```

Reviewed By: zoranzhao

Differential Revision: D55354548

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122690
Approved by: https://github.com/khabinov
2024-03-26 23:25:15 +00:00
09cb42ce29 [dynamo] delete graph_out_{n} after restoring local vars (#122658)
At graph breaks, we create a graph_out_{n} symbol to hold the graph output and
use it to restore the local vars. In addition to their own symbols, the local
vars are kept alive by the symbol we created. This means that if the graph
break is the last usage of one of the symbols, the symbol would still be kept
alive upon graph resumption.

This PR: delete the graph_out_{n} symbol after restoring local vars so the
lifetime of the local vars is governed by themselves.

## Example Problem
Tensor `b`'s last usage is in the graph break. However, it won't be deallocated until `bar()` completes. In the original issue report by @Yuzhen11, `b` is a large tensor and `bar()` is an expensive computation.

```python
import torch

def foo(a):
    return torch.mm(a, a)

@torch._dynamo.disable()
def graph_break_fn(a):
    ret = a.bfloat16()
    return ret

def bar(c):
    return torch.mm(c, c)

def fn(a):
    b = foo(a)
    c = graph_break_fn(b)
    # del b
    return bar(c)

fn_compiled = torch.compile(fn, backend="eager")
a = torch.randn(10000, 10000, device="cuda", requires_grad=True)

fn_compiled(a).sum().backward()
```

Bytecode before this PR:
```
ORIGINAL BYTECODE fn /home/yifu/microbench/del2.py line 18
 19           0 LOAD_GLOBAL              0 (foo)
              2 LOAD_FAST                0 (a)
              4 CALL_FUNCTION            1
              6 STORE_FAST               1 (b)

 20           8 LOAD_GLOBAL              1 (graph_break_fn)
             10 LOAD_FAST                1 (b)
             12 CALL_FUNCTION            1
             14 STORE_FAST               2 (c)

 22          16 LOAD_GLOBAL              2 (bar)
             18 LOAD_FAST                2 (c)
             20 CALL_FUNCTION            1
             22 RETURN_VALUE

MODIFIED BYTECODE fn /home/yifu/microbench/del2.py line 18
 18           0 LOAD_GLOBAL              3 (__compiled_fn_0)
              2 LOAD_FAST                0 (a)
              4 CALL_FUNCTION            1
              6 STORE_FAST               3 (graph_out_0)
              8 LOAD_GLOBAL              1 (graph_break_fn)
             10 LOAD_FAST                3 (graph_out_0)
             12 LOAD_CONST               1 (0)
             14 BINARY_SUBSCR

 20          16 CALL_FUNCTION            1
             18 LOAD_GLOBAL              4 (__resume_at_14_1)
             20 ROT_TWO
             22 CALL_FUNCTION            1
             24 RETURN_VALUE

ORIGINAL BYTECODE torch_dynamo_resume_in_fn_at_20 /home/yifu/microbench/del2.py line 20
 20           0 LOAD_FAST                0 (___stack0)
              2 JUMP_ABSOLUTE            9 (to 18)
              4 LOAD_GLOBAL              0 (foo)
              6 LOAD_FAST                1 (a)
              8 CALL_FUNCTION            1
             10 STORE_FAST               2 (b)
             12 LOAD_GLOBAL              1 (graph_break_fn)
             14 LOAD_FAST                2 (b)
             16 CALL_FUNCTION            1
        >>   18 STORE_FAST               3 (c)

 22          20 LOAD_GLOBAL              2 (bar)
             22 LOAD_FAST                3 (c)
             24 CALL_FUNCTION            1
             26 RETURN_VALUE

MODIFIED BYTECODE torch_dynamo_resume_in_fn_at_20 /home/yifu/microbench/del2.py line 20
 20           0 LOAD_GLOBAL              3 (__compiled_fn_2)
              2 LOAD_FAST                0 (___stack0)
              4 CALL_FUNCTION            1
              6 UNPACK_SEQUENCE          1
              8 RETURN_VALUE
```

Bytecode after this PR:
```
ORIGINAL BYTECODE fn /home/yifu/microbench/del2.py line 18
 19           0 LOAD_GLOBAL              0 (foo)
              2 LOAD_FAST                0 (a)
              4 CALL_FUNCTION            1
              6 STORE_FAST               1 (b)

 20           8 LOAD_GLOBAL              1 (graph_break_fn)
             10 LOAD_FAST                1 (b)
             12 CALL_FUNCTION            1
             14 STORE_FAST               2 (c)

 22          16 LOAD_GLOBAL              2 (bar)
             18 LOAD_FAST                2 (c)
             20 CALL_FUNCTION            1
             22 RETURN_VALUE

MODIFIED BYTECODE fn /home/yifu/microbench/del2.py line 18
 18           0 LOAD_GLOBAL              3 (__compiled_fn_0)
              2 LOAD_FAST                0 (a)
              4 CALL_FUNCTION            1
              6 STORE_FAST               3 (graph_out_0)
              8 LOAD_GLOBAL              1 (graph_break_fn)
             10 LOAD_FAST                3 (graph_out_0)
             12 LOAD_CONST               1 (0)
             14 BINARY_SUBSCR
             16 DELETE_FAST              3 (graph_out_0)

 20          18 CALL_FUNCTION            1
             20 LOAD_GLOBAL              4 (__resume_at_14_1)
             22 ROT_TWO
             24 CALL_FUNCTION            1
             26 RETURN_VALUE

ORIGINAL BYTECODE torch_dynamo_resume_in_fn_at_20 /home/yifu/microbench/del2.py line 20
 20           0 LOAD_FAST                0 (___stack0)
              2 JUMP_ABSOLUTE            9 (to 18)
              4 LOAD_GLOBAL              0 (foo)
              6 LOAD_FAST                1 (a)
              8 CALL_FUNCTION            1
             10 STORE_FAST               2 (b)
             12 LOAD_GLOBAL              1 (graph_break_fn)
             14 LOAD_FAST                2 (b)
             16 CALL_FUNCTION            1
        >>   18 STORE_FAST               3 (c)

 22          20 LOAD_GLOBAL              2 (bar)
             22 LOAD_FAST                3 (c)
             24 CALL_FUNCTION            1
             26 RETURN_VALUE

MODIFIED BYTECODE torch_dynamo_resume_in_fn_at_20 /home/yifu/microbench/del2.py line 20
 20           0 LOAD_GLOBAL              3 (__compiled_fn_2)
              2 LOAD_FAST                0 (___stack0)
              4 CALL_FUNCTION            1
              6 UNPACK_SEQUENCE          1
              8 RETURN_VALUE

```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122658
Approved by: https://github.com/jansel, https://github.com/anijain2305
2024-03-26 22:49:05 +00:00
df724153c1 Add option to skip cudagraphing on dynamic shape graphs (#122520)
This was requested internally.

Differential Revision: [D55264528](https://our.internmc.facebook.com/intern/diff/D55264528)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122520
Approved by: https://github.com/mlazos, https://github.com/shunting314
2024-03-26 21:49:21 +00:00
e229ec6886 [NEON] Speedup float16 convert (#122702)
By using `vcvt_f16_f32` and back

According to [benchmark_convert.py](d3279637ca) this makes float32 to float16 tensor conversion roughly 3 times faster: time to convert 4096x4096 float32 tensor drops from  5.23 msec to 1.66 msec on M2 Pro

Test plan: run `vector_test_all_types` + CI

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122702
Approved by: https://github.com/kimishpatel
2024-03-26 21:48:12 +00:00
6767c04fde Forward fix for broken internal tests related to NJT view dummy (#122704)
(internal link) [example test breakage](https://www.internalfb.com/intern/test/562950061753019?ref_report_id=0)

Symptom: `type stub not overridden` for SymInt. The global NJT dummy relies on `SymInt.__mul__()` in its constructor. Lazily constructing the dummy avoids the race.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122704
Approved by: https://github.com/soulitzer
2024-03-26 21:22:12 +00:00
291848bf30 [Build] Fix AVX detection logic (#122708)
`CXX_AVX[2|512]_FOUND` flags should indicate whether the compiler supports generating code for a given instruction set, rather than whether the host machine can run the generated code.

This fixes a weird problem that surfaced after https://github.com/pytorch/pytorch/pull/122503, where the builder can sometimes be dispatched to an old CPU architecture that cannot run AVX512 instructions but can compile for them just fine.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122708
Approved by: https://github.com/jeanschmidt
2024-03-26 20:37:35 +00:00
3bede14fa7 Don't create world pg variable out of thin air when rewriting c10d collectives (#122561)
Fixes https://github.com/pytorch/pytorch/issues/122404

Previously, when rewriting c10d collectives, if the group argument is
unspecified or None, we create a world pg variable out of thin air and
pass it to the rewrite target. The approach was problematic, as it
assumes the symbol `torch` is available in the scope (see #122404).

After #120560, dynamo can now trace dist.group.WORLD. If the group
argument is unspecified, we can just set it with dist.group.WORLD in the
rewrite target.
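
A hedged, standalone illustration of the default-group handling described above (single-rank gloo setup for demonstration; this is not the dynamo rewrite code itself):

```
import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

group = None                                              # group argument left unspecified...
group = group if group is not None else dist.group.WORLD  # ...now defaults to the traceable WORLD group

t = torch.ones(4)
dist.all_reduce(t, group=group)
print(t)

dist.destroy_process_group()
```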

Testing

pytest test/distributed/test_inductor_collectives.py -k test_dynamo_rewrite_dist_allreduce

Also verified with the repro provided in #122404

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122561
Approved by: https://github.com/wconstab
ghstack dependencies: #120560
2024-03-26 20:12:08 +00:00
852111e1c2 [TORCH_TRACE] Record stack when no compile context is available (#122644)
This will help me track down those annoying unknown compile products.

Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122644
Approved by: https://github.com/jamesjwu
2024-03-26 19:30:52 +00:00
f631586084 Revert "[dynamo] Forward OptimizedModule.__setattr__ to the wrapped module (#122098)"
This reverts commit b6982bf2b25d2d3ba5d82488a39721d6013a838f.

Reverted https://github.com/pytorch/pytorch/pull/122098 on behalf of https://github.com/atalman due to Failing internally ([comment](https://github.com/pytorch/pytorch/pull/122098#issuecomment-2021233604))
2024-03-26 18:54:17 +00:00
537cd66e73 [Inductor] Support custom op in JIT with cpp wrapper (#122554)
Summary: Calling custom ops in an ABI-compatible way requires doing a boxed call with varargs across the C shim. In JIT mode, we can get around this by calling into Python. https://gist.github.com/desertfire/be2a65b0a9b47780bb716b53ac2cd2b3 is an example of the generated code.

Differential Revision: [D55326556](https://our.internmc.facebook.com/intern/diff/D55326556)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122554
Approved by: https://github.com/jansel, https://github.com/chenyang78
2024-03-26 18:48:45 +00:00
e61aaab725 Log autotune time in scuba (#122637)
Summary:
This diff
* Refactors triton and autotune caches to be child classes of the original memcache based cache infra
* Swaps scuba table for autotune
* Adds autotune time spent/saved to scuba table

Test Plan:
Local testing using:
```
buck run mode/opt fbcode//caffe2/test/inductor/:max_autotune -- -r test_max_autotune_remote_caching_dynamic_False
```
and
```
TORCH_INDUCTOR_AUTOTUNE_REMOTE_CACHE=1 buck2 run mode/opt //scripts/oulgen:runner
```

Differential Revision: D55332620

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122637
Approved by: https://github.com/jamesjwu
2024-03-26 17:51:33 +00:00
1f5fcb4e20 [Inductor] Run pattern matcher over the original graph (#122519)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122519
Approved by: https://github.com/jansel
2024-03-26 17:30:32 +00:00
8cfbdc0451 [Easy][DCP] Fix small typo in assert (#122633)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122633
Approved by: https://github.com/awgu, https://github.com/wconstab
2024-03-26 16:46:12 +00:00
30a579dba3 Add XPU ATen merge rule (#122484)
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122484
Approved by: https://github.com/huydhn, https://github.com/malfet
2024-03-26 16:20:48 +00:00
e08cbc0d41 update comment of test_invalid_last_dim_stride in test_transformers.py (#122679)
Fixes #122594

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122679
Approved by: https://github.com/mikaylagawarecki
2024-03-26 15:40:24 +00:00
8bad7b63c8 [ez] Add more files to trigger inductor (#122669)
To catch https://github.com/pytorch/pytorch/pull/122562/files
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122669
Approved by: https://github.com/desertfire
2024-03-26 15:19:30 +00:00
9b90c5e2a1 [CI] Switch pull job linux-jammy-py3_8-gcc11-build to use ARC with runner groups (#122503)
title says it all...
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122503
Approved by: https://github.com/atalman
2024-03-26 14:38:12 +00:00
85845a29db Refactor ShapeEnvSettings so it's directly on ShapeEnv (#122310)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122310
Approved by: https://github.com/masnesral, https://github.com/lezcano
2024-03-26 14:16:33 +00:00
7e176ebb47 Log compilation_metrics to TORCH_TRACE (#122638)
It's not technically needed as you can get it from Scuba too, but it's
more convenient for tlparse to get at it this way.

Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122638
Approved by: https://github.com/albanD
2024-03-26 14:10:55 +00:00
99c822c0ba Let dynamo inline through jacfwd (#121254)
Similar to #121146, the changes are simple and don't require any fancy modification to the codebase. Moved a few entries in trace_rules.py and added tests.
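
A hedged illustration of the newly supported pattern, assuming dynamo can now inline through `torch.func.jacfwd` as described:

```
import torch

def f(x):
    # Forward-mode Jacobian of sin at x.
    return torch.func.jacfwd(torch.sin)(x)

compiled = torch.compile(f, backend="eager")
print(compiled(torch.randn(3)))
```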

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121254
Approved by: https://github.com/zou3519
ghstack dependencies: #120338
2024-03-26 12:43:30 +00:00
2b4173e0de [Quant] [Inductor] Enable the Inductor Lowering of QConv2d post op HardTanh with int8-mix-bf16 (#122374)
**Summary**
Enable the fusion pattern of `QConv2d -> hardtanh` lowering for int8-mixed-bf16 case.

**Test Plan**
```
python -m pytest test_mkldnn_pattern_matcher.py -k test_qconv2d_hardtanh_int8_mixed_bf16_cpu
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122374
Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5
ghstack dependencies: #122266, #122267, #122268, #122373
2024-03-26 08:12:41 +00:00
293579363c [Quant] [Inductor] Enable the Inductor Lowering of QConv2d post op HardSwish with int8-mix-bf16 (#122373)
**Summary**
Enable the fusion pattern of `QConv2d -> hardswish` lowering for int8-mixed-bf16 case.

**Test Plan**
```
python -m pytest test_mkldnn_pattern_matcher.py -k test_qconv2d_hardswish_int8_mixed_bf16_cpu
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122373
Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5
ghstack dependencies: #122266, #122267, #122268
2024-03-26 08:09:35 +00:00
caf9c23310 [Quant] [Inductor] Enable the Inductor Lowering of QConv2d post op SiLU (#122268)
**Summary**
Enable the fusion pattern of `QConv2d -> silu` lowering to `swish` as `QConv2d` post operator.

**Test Plan**
```
python -m pytest test_mkldnn_pattern_matcher.py -k test_qconv2d_silu_cpu
python -m pytest test_mkldnn_pattern_matcher.py -k test_qconv2d_silu_int8_mixed_bf16_cpu
python -m pytest test_mkldnn_pattern_matcher.py -k test_qat_qconv2d_silu
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122268
Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5
ghstack dependencies: #122266, #122267
2024-03-26 08:07:06 +00:00
41d24df08f [export] hack skip index_put_ in dce (#122683)
Summary: Ideally we should do what's in the TODO. Just doing this for now to unblock llama capture.

Test Plan: capturing llama and using pt2e to quantize it

Differential Revision: D55354487

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122683
Approved by: https://github.com/kimishpatel
2024-03-26 08:05:06 +00:00
e0329cba8a [Quant] [PT2] Add SiLU into X86InductorQuantizer Conv2d Unary Annotation (#122267)
**Summary**
Add `SiLU` into X86InductorQuantizer Conv2d Unary Annotation

**TestPlan**
```
python -m pytest test_x86inductor_quantizer.py -k test_conv2d_unary
python -m pytest test_x86inductor_quantizer.py -k test_qat_conv2d_unary
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122267
Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5
ghstack dependencies: #122266
2024-03-26 08:03:42 +00:00
b7089937dc Disable test (test_mm_plus_mm2_cuda_cuda_wrapper) (#122682)
Summary:
The test is unstable at the moment. We need to make sure both the ATen
and Triton kernels work before reactivating the test.

Test Plan:
Disabling test

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122682
Approved by: https://github.com/clee2000
2024-03-26 07:14:35 +00:00
f8eeae7aaa Enable CPP wrapper codegen registration (#121296)
Extend codegen registration for `CppWrapper`. With this PR, a new backend can register its specific `CppWrapper` at runtime.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121296
Approved by: https://github.com/jansel, https://github.com/desertfire
2024-03-26 06:51:03 +00:00
d1f58eaaf5 [inductor] Fix bug with freezing + split_cat passes (#122544)
Fixes #122380

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122544
Approved by: https://github.com/eellison
2024-03-26 06:12:57 +00:00
268b0cc714 Do not run CUDA lazy init if it is triggered with fake mode on. (#122636)
Partially fixes https://github.com/pytorch/pytorch/issues/122109

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122636
Approved by: https://github.com/zou3519
2024-03-26 05:43:59 +00:00
dd3f2cb53a [Inductor] Add NEON ISA support on arm64 Macs (#122217)
This started as a re-land of https://github.com/pytorch/pytorch/pull/105590 but focusing on enabling it on MacOS, but quickly turned into landing very limited platform-specific acceleration at this time (I.e. this PR does not add any NEON accelerated code at all, just enables vectorized compilation for the existing abstractions)

Enabling the test harness, uncovered number of latent issues in CPU inductor that were fixed in the following PRS:
- https://github.com/pytorch/pytorch/pull/122511
- https://github.com/pytorch/pytorch/pull/122513
- https://github.com/pytorch/pytorch/pull/122580
- https://github.com/pytorch/pytorch/pull/122608

Following was added/changed to enable vectorization code to work on MacOS
 - Added VecNEON class to `_inductor/codecache.py`  that is supported on all AppleSilicon Macs
 - Added `Vectorized::loadu_one_fourth` to `vec_base.h`, and limit it to 8-bit types
 - Change 64-bit integral types mapping to `int64_t`/`uint64_t` to align with the rest of the code, as on MacOS, `int64_t` is a `long long` rather than `long` (see https://github.com/pytorch/pytorch/pull/118149 for more details)

See table below for perf changes with and without torch.compile using [gpt-fast](https://github.com/pytorch-labs/gpt-fast) running `stories15M` on M2 Pro:
| dtype  | Eager | Compile (before) | Compile (after) |
| ------ | ------ | --------- | --------- |
| bfloat16  | 120 tokens/sec  | 130 tokens/sec | 156 tokens/sec |
| float32  | 158 tokens/sec  | 140 tokens/sec | 236 tokens/sec |
| float16  | 235 tokens/sec  | 81 tokens/sec | 58 tokens/sec |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122217
Approved by: https://github.com/jansel
2024-03-26 05:07:30 +00:00
a333b080c1 Only update momentum buffers for SGD if momentum is enabled (#122349)
As title

[benchmark](https://gist.github.com/mlazos/1171f035a2392c33778aaa3d7bf24370)

Helps compiled vanilla SGD execution time by 2x on certain models with large number of small params (ex.
ElectraForQuestionAnswering goes from 1090us -> 554us)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122349
Approved by: https://github.com/janeyx99
2024-03-26 04:19:39 +00:00
0c47f8028e Keep example_inputs when saving and loading ExportedProgram (#122618)
Summary:
`torch.export` is a powerful tool for creating a structured and shareable package from arbitrary pytorch code. One great use case of `torch.export` is sharing models or subgraphs in a way that allows results to be easily replicated. However, in the current implementation of `export`, the `example_inputs` field is thrown out. When trying to replicate bugs, benchmarks, or behaviors, losing the original input shapes and values makes the process much messier.

This change adds saving and loading for the `example_inputs` attribute of an `ExportedProgram` when using `torch.export.save` and `torch.export.load`. This simple addition makes `ExportedProgram`s a fantastic tool for performance and accuracy replication. For example, with this change we enable the following workflow:

```
# Script to create a reproducible accuracy issue with my model.
kwargs = {"fastmath_mode": True}
exp_program = export(my_model, sample_inputs, kwargs)
result = exp_program.module()(*sample_inputs, **kwargs)
# Uhoh, I dont like that result, lets send the module to a colleague to take a look.
torch.export.save(exp_program, "my_model.pt2")
```

My colleague can then easily reproduce my results like so:

```
# Script to load and reproduce results from a saved ExportedProgram.
loaded_program = torch.export.load("my_model.pt2")
# The following line is enabled by this Diff, we pull out the arguments
# and options that caused the issue.
args, kwargs = loaded_program.example_inputs
reproduced_result = loaded_program.module()(*args, **kwargs)
# Oh I see what happened here, lets fix it.
```

Being able to share exact inputs and arguments makes `ExportedProgram`s much
cleaner and more powerful with little downside. The main potential issue with this change
is that it does slightly increase the size of saved programs. However, the size of
inputs will be much smaller than parameters in most cases. I am curious to hear
discussion on saved file size though.

The deserialization of `example_inputs` is currently implemented as `Optional`. Although this wont effect users of `export.save` and `export.load`, it does give backwards compatibility to any direct users of `serialize` and `deserialize`.

Test Plan:
This diff includes a new test which exercises the save / load flow with multiple args and kwargs.

```
buck test //caffe2/test:test_export -- TestSerialize
```

Differential Revision: D55294614

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122618
Approved by: https://github.com/zhxchen17
2024-03-26 03:32:44 +00:00
47e8d60627 [dtensor] add op support for view_as_complex and view_as_real (#122569)
This PR will unblock DTensor computations for [rotary embeddings](https://github.com/meta-llama/llama/blob/main/llama/model.py#L132) used in LLaMa training.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122569
Approved by: https://github.com/wanchaol
ghstack dependencies: #122541
2024-03-26 03:32:04 +00:00
1af6fc5e03 Remove top-level DisableFuncTorch; clearing interpreter stack should work. (#122610)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122610
Approved by: https://github.com/zou3519
ghstack dependencies: #122202
2024-03-26 03:08:22 +00:00
f42818321b Restore DILL_AVAILABLE for backwards compat with torchdata (#122616)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122616
Approved by: https://github.com/peterbell10
2024-03-26 02:18:51 +00:00
55f36d1ada Revert "[AOTInductor] Add tensor_constantX to pass constant buffer update's check (#122562)"
This reverts commit 57a3d00b0659e4ac37c4a35a36c71f710e89197a.

Reverted https://github.com/pytorch/pytorch/pull/122562 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/122562#issuecomment-2019262415))
2024-03-26 02:18:19 +00:00
4e0b5d59fa [dtensor] add backward support for scaled dot product attention (flash-attention) (#122541)
As titled, as a followup to the forward part #120298.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122541
Approved by: https://github.com/wanchaol
2024-03-26 01:50:24 +00:00
c2d4f8fa7a Update base for Update on "[NJT] Actually inline NT torch function during dynamo"
In order to avoid having any temporary state where the behavior of anything is regressed. This PR does all of the following at once:

(1) Disables torch function running a second time in AOTAutograd

If you have a tensor subclass that relies on dispatch into the same op without unwrapping and calling torch._C.DisableTorchFunctionSubclass() the torch function-ness will survive into AOTAutograd (when normally we may expect the torch function to be inlined away during dynamo). If this happens, we should make sure to not run the torch function logic a second time.

2.  Enables torch function to be inlined in dynamo for NT

Due to torch function running a second time AOTAutograd, NT was actually relying on this behavior instead of properly inlining through torch function at the dynamo level. 

3. Fixes graph breaks for NT torch function

Now that we are inlining through torch function for the first time in dynamo, we've uncovered some graph breaks. Thanks to mlazos, we should have support for custom attributes for torch function now. We also add support for a custom Enum type. Finally, a few of them we can get rid of by adding allow_in_graph (though we may need to double check the soundness here).


Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-21 19:31:23 -07:00
c269e4a200 Update base for Update on "[NJT] Actually inline NT torch function during dynamo"
In order to avoid any temporary state where behavior is regressed, this PR does all of the following at once:

(1) Disables torch function running a second time in AOTAutograd

If you have a tensor subclass that relies on dispatching into the same op without unwrapping and without calling torch._C.DisableTorchFunctionSubclass(), the torch-function-ness will survive into AOTAutograd (when normally we would expect the torch function to be inlined away during dynamo). If this happens, we should make sure not to run the torch function logic a second time.

(2) Enables torch function to be inlined in dynamo for NT

Because torch function ran a second time in AOTAutograd, NT was actually relying on that behavior instead of properly inlining through torch function at the dynamo level.

(3) Fixes graph breaks for NT torch function

Now that we are inlining through torch function for the first time in dynamo, we've uncovered some graph breaks. Thanks to mlazos, we should now have support for custom attributes for torch function. We also add support for a custom Enum type. Finally, a few of the graph breaks can be removed by adding allow_in_graph (though we may need to double-check the soundness here).


Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-21 19:29:26 -07:00
f59cfa5d5b Update base for Update on "[NJT] Actually inline NT torch function during dynamo"
In order to avoid any temporary state where behavior is regressed, this PR does all of the following at once:

(1) Disables torch function running a second time in AOTAutograd

If you have a tensor subclass that relies on dispatching into the same op without unwrapping and without calling torch._C.DisableTorchFunctionSubclass(), the torch-function-ness will survive into AOTAutograd (when normally we would expect the torch function to be inlined away during dynamo). If this happens, we should make sure not to run the torch function logic a second time.

(2) Enables torch function to be inlined in dynamo for NT

Because torch function ran a second time in AOTAutograd, NT was actually relying on that behavior instead of properly inlining through torch function at the dynamo level.

(3) Fixes graph breaks for NT torch function

Now that we are inlining through torch function for the first time in dynamo, we've uncovered some graph breaks. Thanks to mlazos, we should now have support for custom attributes for torch function. We also add support for a custom Enum type. Finally, a few of the graph breaks can be removed by adding allow_in_graph (though we may need to double-check the soundness here).


Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-21 19:25:40 -07:00
f633002021 Update base for Update on "[NJT] Actually inline NT torch function during dynamo"
In order to avoid any temporary state where behavior is regressed, this PR does all of the following at once:

(1) Disables torch function running a second time in AOTAutograd

If you have a tensor subclass that relies on dispatching into the same op without unwrapping and without calling torch._C.DisableTorchFunctionSubclass(), the torch-function-ness will survive into AOTAutograd (when normally we would expect the torch function to be inlined away during dynamo). If this happens, we should make sure not to run the torch function logic a second time.

(2) Enables torch function to be inlined in dynamo for NT

Because torch function ran a second time in AOTAutograd, NT was actually relying on that behavior instead of properly inlining through torch function at the dynamo level.

(3) Fixes graph breaks for NT torch function

Now that we are inlining through torch function for the first time in dynamo, we've uncovered some graph breaks. Thanks to mlazos, we should now have support for custom attributes for torch function. We also add support for a custom Enum type. Finally, a few of the graph breaks can be removed by adding allow_in_graph (though we may need to double-check the soundness here).


Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-21 19:11:19 -07:00
535da81018 Update base for Update on "[NJT] Actually inline NT torch function during dynamo"
In order to avoid any temporary state where behavior is regressed, this PR does all of the following at once:

(1) Disables torch function running a second time in AOTAutograd

If you have a tensor subclass that relies on dispatching into the same op without unwrapping and without calling torch._C.DisableTorchFunctionSubclass(), the torch-function-ness will survive into AOTAutograd (when normally we would expect the torch function to be inlined away during dynamo). If this happens, we should make sure not to run the torch function logic a second time.

(2) Enables torch function to be inlined in dynamo for NT

Because torch function ran a second time in AOTAutograd, NT was actually relying on that behavior instead of properly inlining through torch function at the dynamo level.

(3) Fixes graph breaks for NT torch function

Now that we are inlining through torch function for the first time in dynamo, we've uncovered some graph breaks. Thanks to mlazos, we should now have support for custom attributes for torch function. We also add support for a custom Enum type. Finally, a few of the graph breaks can be removed by adding allow_in_graph (though we may need to double-check the soundness here).


Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-19 14:53:36 -07:00
10716a4af4 Update base for Update on "[NJT] Actually inline NT torch function during dynamo"
In order to avoid any temporary state where behavior is regressed, this PR does all of the following at once:

(1) Disables torch function running a second time in AOTAutograd

If you have a tensor subclass that relies on dispatching into the same op without unwrapping and without calling torch._C.DisableTorchFunctionSubclass(), the torch-function-ness will survive into AOTAutograd (when normally we would expect the torch function to be inlined away during dynamo). If this happens, we should make sure not to run the torch function logic a second time.

(2) Enables torch function to be inlined in dynamo for NT

Because torch function ran a second time in AOTAutograd, NT was actually relying on that behavior instead of properly inlining through torch function at the dynamo level.

(3) Fixes graph breaks for NT torch function

Now that we are inlining through torch function for the first time in dynamo, we've uncovered some graph breaks. Thanks to mlazos, we should now have support for custom attributes for torch function. We also add support for a custom Enum type. Finally, a few of the graph breaks can be removed by adding allow_in_graph (though we may need to double-check the soundness here).


Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-19 13:26:37 -07:00
30ed14c97c Update base for Update on "[NJT] Actually inline NT torch function during dynamo"
In order to avoid any temporary state where behavior is regressed, this PR does all of the following at once:

(1) Disables torch function running a second time in AOTAutograd

If you have a tensor subclass that relies on dispatching into the same op without unwrapping and without calling torch._C.DisableTorchFunctionSubclass(), the torch-function-ness will survive into AOTAutograd (when normally we would expect the torch function to be inlined away during dynamo). If this happens, we should make sure not to run the torch function logic a second time.

(2) Enables torch function to be inlined in dynamo for NT

Because torch function ran a second time in AOTAutograd, NT was actually relying on that behavior instead of properly inlining through torch function at the dynamo level.

(3) Fixes graph breaks for NT torch function

Now that we are inlining through torch function for the first time in dynamo, we've uncovered some graph breaks. Thanks to mlazos, we should now have support for custom attributes for torch function. We also add support for a custom Enum type. Finally, a few of the graph breaks can be removed by adding allow_in_graph (though we may need to double-check the soundness here).


Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-19 13:23:33 -07:00
bfc53c9d89 Update base for Update on "[NJT] Actually inline NT torch function during dynamo"
In order to avoid any temporary state where behavior is regressed, this PR does all of the following at once:

(1) Disables torch function running a second time in AOTAutograd

If you have a tensor subclass that relies on dispatching into the same op without unwrapping and without calling torch._C.DisableTorchFunctionSubclass(), the torch-function-ness will survive into AOTAutograd (when normally we would expect the torch function to be inlined away during dynamo). If this happens, we should make sure not to run the torch function logic a second time.

(2) Enables torch function to be inlined in dynamo for NT

Because torch function ran a second time in AOTAutograd, NT was actually relying on that behavior instead of properly inlining through torch function at the dynamo level.

(3) Fixes graph breaks for NT torch function

Now that we are inlining through torch function for the first time in dynamo, we've uncovered some graph breaks. Thanks to mlazos, we should now have support for custom attributes for torch function. We also add support for a custom Enum type. Finally, a few of the graph breaks can be removed by adding allow_in_graph (though we may need to double-check the soundness here).


Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-12 12:32:30 -07:00
3f52723029 Update base for Update on "[NJT] Actually inline NT torch function during dynamo"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-12 12:07:07 -07:00
f518b82db8 Update on "Prevent __torch_function__ running a second time in AOTAutograd"
If you have a tensor subclass that relies on dispatching into the same op without unwrapping and without calling torch._C.DisableTorchFunctionSubclass(), the torch-function-ness will survive into AOTAutograd (when normally we would expect the torch function to be inlined away during dynamo). If this happens, we should make sure not to run the torch function logic a second time.
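For reference, a minimal sketch of the conventional subclass pattern this paragraph contrasts against: the guard shown below is what such a subclass would normally use, and a subclass that omits it is the case being handled here (the class itself is hypothetical):

```python
import torch

class MyTensor(torch.Tensor):
    # Conventional pattern: re-dispatch into the same op with subclass
    # __torch_function__ handling disabled, so the call does not recurse
    # and the torch-function logic does not run again downstream.
    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        with torch._C.DisableTorchFunctionSubclass():
            return func(*args, **kwargs)

t = torch.randn(3).as_subclass(MyTensor)
print((t + 1).__class__)   # the result stays a MyTensor
```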

Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc bdhirsh 

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-07 00:54:08 -05:00
05de3d9f0e Update base for Update on "Prevent __torch_function__ running a second time in AOTAutograd"
If you have a tensor subclass that relies on dispatching into the same op without unwrapping and without calling torch._C.DisableTorchFunctionSubclass(), the torch-function-ness will survive into AOTAutograd (when normally we would expect the torch function to be inlined away during dynamo). If this happens, we should make sure not to run the torch function logic a second time.

Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc bdhirsh 

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-07 00:54:08 -05:00
917caf322f Update on "Prevent __torch_function__ running a second time in AOTAutograd"
If you have a tensor subclass that relies on dispatching into the same op without unwrapping and without calling torch._C.DisableTorchFunctionSubclass(), the torch-function-ness will survive into AOTAutograd (when normally we would expect the torch function to be inlined away during dynamo). If this happens, we should make sure not to run the torch function logic a second time.

Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc bdhirsh 

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-06 23:59:58 -05:00
8a666191c1 Update base for Update on "Prevent __torch_function__ running a second time in AOTAutograd"
If you have a tensor subclass that relies on dispatching into the same op without unwrapping and without calling torch._C.DisableTorchFunctionSubclass(), the torch-function-ness will survive into AOTAutograd (when normally we would expect the torch function to be inlined away during dynamo). If this happens, we should make sure not to run the torch function logic a second time.

Fixes https://github.com/pytorch/pytorch/issues/120654, https://github.com/pytorch/pytorch/issues/120124

cc bdhirsh 

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-06 23:59:58 -05:00
628277a810 Update on "Prevent __torch_function__ running a second time in AOTAutograd"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-06 21:20:29 -05:00
8352effea1 Update base for Update on "Prevent __torch_function__ running a second time in AOTAutograd"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-06 21:20:29 -05:00
29580cc5a9 Update on "Prevent __torch_function__ running a second time in AOTAutograd"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-06 21:18:15 -05:00
1fce3be2aa Update base for Update on "Prevent __torch_function__ running a second time in AOTAutograd"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang

[ghstack-poisoned]
2024-03-06 21:18:15 -05:00
90fe67165d Prevent __torch_function__ running a second time in AOTAutograd
[ghstack-poisoned]
2024-03-06 20:34:29 -05:00
cdd0f0db49 Update on "Add infra TorchFunctionModeKey enum and extra TLS state for disabling"
[ghstack-poisoned]
2024-03-06 20:34:29 -05:00
a4dfc66721 Add infra TorchFunctionModeKey enum and extra TLS state for disabling
[ghstack-poisoned]
2024-03-06 20:05:55 -05:00
545 changed files with 11043 additions and 5045 deletions

View File

@ -1 +1 @@
7f96f5a852ba452670255d28d59f1e6398141fbb
d4b3e5cc607e97afdba79dc90f8ef968142f347c

View File

@ -36,6 +36,7 @@ hicpp-exception-baseclass,
hicpp-avoid-goto,
misc-*,
-misc-const-correctness,
-misc-include-cleaner,
-misc-use-anonymous-namespace,
-misc-unused-parameters,
-misc-no-recursion,

View File

@ -10,9 +10,9 @@ inputs:
description: Text that uniquely identifies a given job type within a workflow. All shards of a job should share the same job identifier.
required: true
s3_bucket:
description: S3 bucket to upload/download PyTest cache
description: S3 bucket to download PyTest cache
required: false
default: ""
default: "gha-artifacts"
runs:
using: composite

View File

@ -1 +1 @@
17a70815259222570feb071034acd7bae2adc019
ea437b31ce316ea3d66fe73768c0dcb94edb79ad

View File

@ -1 +1 @@
a0c79b399b75368208464b2c638708165cca7ef1
2c4665ffbb64f03f5d18016d3398af4ac4da5f03

View File

@ -1 +1 @@
707a632930bfde19ffb361cdf5c31a7682af4e67
b0ba29f98a695671972d4a4cc07441014dba2892

.github/labeler.yml (vendored), 2 changes
View File

@ -35,6 +35,8 @@
- test/distributed/tensor/parallel/test_fsdp_2d_parallel.py
- torch/distributed/_tensor/**
- torch/distributed/fsdp/**
- torch/csrc/inductor/**
- test/cpp/aot_inductor/**
"module: cpu":
- aten/src/ATen/cpu/**

View File

@ -236,6 +236,20 @@
- Lint
- pull
- name: XPU ATen
patterns:
- aten/src/ATen/xpu/**
- c10/xpu/**
- third_party/xpu.txt
approved_by:
- EikanWang
- jgong5
- gujinghui
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- name: Distributions
patterns:
- torch/distributions/**

View File

@ -37,7 +37,7 @@ jobs:
linux-jammy-py3_8-gcc11-build:
name: linux-jammy-py3.8-gcc11
uses: ./.github/workflows/_linux-build.yml
uses: ./.github/workflows/_linux-build-rg.yml
with:
build-environment: linux-jammy-py3.8-gcc11
docker-image-name: pytorch-linux-jammy-py3.8-gcc11

View File

@ -45,10 +45,12 @@ jobs:
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-test:

View File

@ -192,6 +192,8 @@ include_patterns = [
'aten/src/ATen/*.cpp',
'aten/src/ATen/core/*.h',
'aten/src/ATen/core/*.cpp',
'aten/src/ATen/functorch/*.h',
'aten/src/ATen/functorch/*.cpp',
'c10/**/*.cpp',
'c10/**/*.h',
'torch/csrc/*.h',
@ -1906,6 +1908,7 @@ exclude_patterns = [
'torch/compiler/__init__.py',
'torch/contrib/__init__.py',
'torch/contrib/_tensorboard_vis.py',
"torch/cuda/_gpu_trace.py",
'torch/cuda/_memory_viz.py', # mypy: Value of type "object" is not indexable
'torch/distributed/__init__.py',
'torch/distributed/_composable_state.py',
@ -2371,7 +2374,7 @@ exclude_patterns = [
'torch/testing/_internal/common_subclass.py',
'torch/testing/_internal/common_utils.py',
'torch/testing/_internal/composite_compliance.py',
'torch/testing/_internal/control_flow_opinfo_db.py',
'torch/testing/_internal/hop_db.py',
'torch/testing/_internal/custom_op_db.py',
'torch/testing/_internal/data/__init__.py',
'torch/testing/_internal/data/network1.py',
@ -2433,7 +2436,6 @@ exclude_patterns = [
'torch/utils/_contextlib.py',
'torch/utils/_cpp_extension_versioner.py',
'torch/utils/_crash_handler.py',
'torch/utils/_cuda_trace.py',
'torch/utils/_device.py',
'torch/utils/_foreach_utils.py',
'torch/utils/_freeze.py',
@ -2442,7 +2444,6 @@ exclude_patterns = [
'torch/utils/_stats.py',
'torch/utils/_sympy/__init__.py',
'torch/utils/_sympy/functions.py',
'torch/utils/_sympy/value_ranges.py',
'torch/utils/_traceback.py',
'torch/utils/_zip.py',
'torch/utils/backcompat/__init__.py',
@ -2562,6 +2563,7 @@ exclude_patterns = [
'torch/utils/viz/__init__.py',
'torch/utils/viz/_cycles.py',
'torch/utils/weak.py',
'torch/xpu/_gpu_trace.py',
]
init_command = [
'python3',

View File

@ -742,13 +742,28 @@ if(MSVC)
append_cxx_flag_if_supported("/utf-8" CMAKE_CXX_FLAGS)
endif()
# CAVEAT: do NOT check USE_ROCM here, because USE_ROCM is always True until
# include(cmake/Dependencies.cmake)
# Note for ROCM platform:
# 1. USE_ROCM is always ON until include(cmake/Dependencies.cmake)
# 2. USE_CUDA will become OFF during re-configuration
# Truth Table:
# CUDA 1st pass: USE_CUDA=True;USE_ROCM=True, FLASH evaluates to ON by default
# CUDA 2nd pass: USE_CUDA=True;USE_ROCM=False, FLASH evaluates to ON by default
# ROCM 1st pass: USE_CUDA=True;USE_ROCM=True, FLASH evaluates to ON by default
# ROCM 2nd pass: USE_CUDA=False;USE_ROCM=True, FLASH evaluates to ON by default
# CPU 1st pass: USE_CUDA=False(Cmd Option);USE_ROCM=True, FLASH evaluates to OFF by default
# CPU 2nd pass: USE_CUDA=False(Cmd Option);USE_ROCM=False, FLASH evaluates to OFF by default
# Thus we cannot tell ROCM 2nd pass and CPU 1st pass
#
# The only solution is to include(cmake/Dependencies.cmake), and defer the
# aotriton build decision later.
include(cmake/Dependencies.cmake)
cmake_dependent_option(
USE_FLASH_ATTENTION
"Whether to build the flash_attention kernel for scaled dot product attention.\
Will be disabled if not supported by the platform" ON
"USE_CUDA AND NOT MSVC" OFF)
"USE_CUDA OR USE_ROCM;NOT MSVC" OFF)
# We are currenlty not using alibi attention for Flash
# So we disable this feature by default
@ -764,8 +779,6 @@ cmake_dependent_option(
Will be disabled if not supported by the platform" ON
"USE_CUDA" OFF)
include(cmake/Dependencies.cmake)
if(DEBUG_CUDA)
string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -lineinfo")
string(APPEND CMAKE_CUDA_FLAGS_RELWITHDEBINFO " -lineinfo")

View File

@ -67,6 +67,7 @@ nn/qat/ @jerryzh168
/test/run_test.py @pytorch/pytorch-dev-infra
/torch/testing/_internal/common_device_type.py @mruberry
/torch/testing/_internal/common_utils.py @pytorch/pytorch-dev-infra
/torch/testing/_internal/hop_db.py @tugsbayasgalan @zou3519 @ydwu4
# Parametrizations
/torch/nn/utils/parametriz*.py @lezcano

View File

@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$")
list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo)
endif()
if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
# Preserve values for the main build
set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS})
set(__aten_sleef_build_tests ${BUILD_TESTS})
# Unset our restrictive C++ flags here and reset them later.
# Remove this once we use proper target_compile_options.
set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
set(CMAKE_CXX_FLAGS)
# Bump up optimization level for sleef to -O1, since at -O0 the compiler
# excessively spills intermediate vector registers to the stack
# and makes things run impossibly slowly
set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0")
string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
else()
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1")
if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
if(NOT MSVC)
# Bump up optimization level for sleef to -O1, since at -O0 the compiler
# excessively spills intermediate vector registers to the stack
# and makes things run impossibly slowly
set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0")
string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
else()
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1")
endif()
endif()
if(NOT USE_SYSTEM_SLEEF)
set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE)
set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE)
set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE)
set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE)
set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE})
set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE)
set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE)
set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE)
set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE)
set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." FORCE)
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE)
@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
endif()
list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef)
set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS})
# Set these back. TODO: Use SLEEF_ to pass these instead
set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE)
set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE)
if(NOT MSVC)
set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
endif()
endif()
if(USE_CUDA AND NOT USE_ROCM)

View File

@ -484,8 +484,8 @@ c10::optional<Tensor> to_functional_tensor(const c10::optional<Tensor>& tensor)
}
return c10::nullopt;
}
c10::List<c10::optional<Tensor>> to_functional_tensor(const c10::List<c10::optional<Tensor>>& t_list) {
c10::List<c10::optional<Tensor>> outputs;
c10::List<::std::optional<Tensor>> to_functional_tensor(const c10::List<::std::optional<Tensor>>& t_list) {
c10::List<::std::optional<Tensor>> outputs;
outputs.reserve(t_list.size());
for (const auto i : c10::irange(t_list.size())) {
outputs.push_back(to_functional_tensor(t_list[i]));
@ -536,8 +536,8 @@ std::vector<Tensor> from_functional_tensor(ITensorListRef t_list) {
}
return outputs;
}
c10::List<c10::optional<Tensor>> from_functional_tensor(const c10::List<c10::optional<Tensor>>& t_list) {
c10::List<c10::optional<Tensor>> outputs;
c10::List<::std::optional<Tensor>> from_functional_tensor(const c10::List<::std::optional<Tensor>>& t_list) {
c10::List<::std::optional<Tensor>> outputs;
outputs.reserve(t_list.size());
for (const auto i : c10::irange(t_list.size())) {
outputs.push_back(from_functional_tensor(t_list[i], /*assert_functional=*/false));
@ -572,7 +572,7 @@ void sync(ITensorListRef t_list) {
sync(t);
}
}
void sync(const c10::List<c10::optional<Tensor>>& t_list) {
void sync(const c10::List<::std::optional<Tensor>>& t_list) {
for (const auto i : c10::irange(t_list.size())) {
sync(t_list[i]);
}
@ -652,7 +652,7 @@ bool isFunctionalTensor(const c10::optional<Tensor>& t) {
}
}
bool isFunctionalTensor(const c10::List<c10::optional<Tensor>>& t_list) {
bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
if (t_list.empty()) return false;
auto functional_count = 0;
for (const auto i : c10::irange(t_list.size())) {

View File

@ -317,10 +317,10 @@ static inline void recordTensorIndex(
(*dim_ptr)++;
};
static inline c10::List<c10::optional<Tensor>> typeConvertIndices(
static inline c10::List<::std::optional<Tensor>> typeConvertIndices(
const Tensor& /*self*/,
std::vector<Tensor>&& indices) {
c10::List<c10::optional<Tensor>> converted_inds;
c10::List<::std::optional<Tensor>> converted_inds;
converted_inds.reserve(indices.size());
for (auto&& i : std::move(indices)) {
converted_inds.push_back(std::move(i));

View File

@ -13,4 +13,12 @@ at::Tensor Generator::get_state() const {
return at::Tensor::wrap_tensor_impl(this->impl_->get_state());
}
void Generator::graphsafe_set_state(const Generator& new_state) {
this->impl_->graphsafe_set_state(new_state.getIntrusivePtr());
}
Generator Generator::graphsafe_get_state() const {
return Generator(this->impl_->graphsafe_get_state());
}
} // namespace at

View File

@ -107,6 +107,10 @@ struct TORCH_API Generator {
at::Tensor get_state() const;
void graphsafe_set_state(const Generator& new_state);
Generator graphsafe_get_state() const;
std::mutex& mutex() {
return impl_->mutex_;
}

View File

@ -1154,15 +1154,15 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) {
"(int[]? a) -> int[]?");
// Test list of optional (with empty list)
testArgTypes<c10::List<c10::optional<int64_t>>>::test(
c10::List<c10::optional<int64_t>>(c10::List<c10::optional<int64_t>>({})), [] (const c10::List<c10::optional<int64_t>>& v) {EXPECT_EQ(0, v.size());},
c10::List<c10::optional<int64_t>>(c10::List<c10::optional<int64_t>>({})), [] (const IValue& v) {EXPECT_EQ(0, v.to<c10::List<c10::optional<int64_t>>>().size());},
testArgTypes<c10::List<::std::optional<int64_t>>>::test(
c10::List<::std::optional<int64_t>>(c10::List<::std::optional<int64_t>>({})), [] (const c10::List<::std::optional<int64_t>>& v) {EXPECT_EQ(0, v.size());},
c10::List<::std::optional<int64_t>>(c10::List<::std::optional<int64_t>>({})), [] (const IValue& v) {EXPECT_EQ(0, v.to<c10::List<::std::optional<int64_t>>>().size());},
"(int?[] a) -> int?[]");
// Test list of optional (with values)
testArgTypes<c10::List<c10::optional<int64_t>>>::test(
c10::List<c10::optional<int64_t>>(c10::List<c10::optional<int64_t>>({3, c10::nullopt, 2})), [] (const c10::List<c10::optional<int64_t>>& v) {expectListEquals<c10::optional<int64_t>>({3, c10::nullopt, 2}, v);},
c10::List<c10::optional<int64_t>>(c10::List<c10::optional<int64_t>>({3, c10::nullopt, 2})), [] (const IValue& v) {expectListEquals<c10::optional<int64_t>>({3, c10::nullopt, 2}, v.to<c10::List<c10::optional<int64_t>>>());},
testArgTypes<c10::List<::std::optional<int64_t>>>::test(
c10::List<::std::optional<int64_t>>(c10::List<::std::optional<int64_t>>({3, c10::nullopt, 2})), [] (const c10::List<::std::optional<int64_t>>& v) {expectListEquals<c10::optional<int64_t>>({3, c10::nullopt, 2}, v);},
c10::List<::std::optional<int64_t>>(c10::List<::std::optional<int64_t>>({3, c10::nullopt, 2})), [] (const IValue& v) {expectListEquals<c10::optional<int64_t>>({3, c10::nullopt, 2}, v.to<c10::List<::std::optional<int64_t>>>());},
"(int?[] a) -> int?[]");
// dict types
@ -1234,15 +1234,15 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) {
"(Dict(int, Tensor) a) -> Dict(int, Tensor)");
// weird deeply nested type
using DeeplyNestedType = c10::List<c10::Dict<std::string, c10::List<c10::optional<c10::Dict<int64_t, std::string>>>>>;
using DeeplyNestedType = c10::List<c10::Dict<std::string, c10::List<::std::optional<c10::Dict<int64_t, std::string>>>>>;
auto makeDeeplyNestedObject = [] () -> DeeplyNestedType {
c10::Dict<int64_t, std::string> inner3;
inner3.insert(1, "1");
c10::List<c10::optional<c10::Dict<int64_t, std::string>>> inner2;
c10::List<::std::optional<c10::Dict<int64_t, std::string>>> inner2;
inner2.push_back(std::move(inner3));
c10::Dict<std::string, c10::List<c10::optional<c10::Dict<int64_t, std::string>>>> inner1;
c10::Dict<std::string, c10::List<::std::optional<c10::Dict<int64_t, std::string>>>> inner1;
inner1.insert("key", std::move(inner2));
c10::List<c10::Dict<std::string, c10::List<c10::optional<c10::Dict<int64_t, std::string>>>>> result;
c10::List<c10::Dict<std::string, c10::List<::std::optional<c10::Dict<int64_t, std::string>>>>> result;
result.push_back(inner1);
return result;
};

View File

@ -22,6 +22,9 @@
#include <ATen/cpu/vec/vec256/vec256_bfloat16.h>
#endif
#include <ATen/cpu/vec/vec256/vec256_convert.h>
#include <ATen/cpu/vec/vec256/vec256_mask.h>
#include <algorithm>
#include <cstddef>
#include <cstdint>
@ -69,7 +72,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
}
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX2)
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -94,7 +97,8 @@ inline Vectorized<double> cast<double, int64_t>(const Vectorized<int64_t>& src)
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#ifndef _MSC_VER
// MSVC is not working well on complex function overload.
template<int64_t scale = 1>
std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
inline gather(const double* base_addr, const Vectorized<int64_t>& vindex) {
@ -106,9 +110,10 @@ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorize
inline gather(const float* base_addr, const Vectorized<int32_t>& vindex) {
return _mm256_i32gather_ps(base_addr, vindex, scale);
}
#endif
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#ifndef _MSC_VER
// MSVC is not working well on complex function overload.
template<int64_t scale = 1>
std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
inline mask_gather(const Vectorized<double>& src, const double* base_addr,
@ -122,7 +127,7 @@ inline mask_gather(const Vectorized<float>& src, const float* base_addr,
const Vectorized<int32_t>& vindex, Vectorized<float>& mask) {
return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale);
}
#endif
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Only works for inputs in the range: [-2^51, 2^51]
@ -302,6 +307,6 @@ inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
return flip8(v);
}
#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#endif // (defined(CPU_CAPABILITY_AVX2)
}} // namepsace at::vec::CPU_CAPABILITY

View File

@ -7,7 +7,8 @@
#include <ATen/cpu/vec/vec_base.h>
#include <c10/util/irange.h>
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX2)
#define SLEEF_STATIC_LIBS
#include <sleef.h>
#endif
@ -18,7 +19,18 @@ namespace at::vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX2)
#ifndef SLEEF_CONST
#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER)
#define SLEEF_CONST const
#else
#define SLEEF_CONST
#endif
#define SLEEF_CONST_OLD SLEEF_CONST
#else
#define SLEEF_CONST_OLD
#endif
// bfloat16 conversion
static inline void cvtbf16_fp32(const __m128i& a, __m256& o) {
@ -31,6 +43,28 @@ static inline void cvtbf16_fp32(const __m256i& a, __m256& o1, __m256& o2) {
cvtbf16_fp32(lo, o1);
cvtbf16_fp32(hi, o2);
}
static inline __m128i cvtfp32_bf16(const __m256& src) {
__m256i value = _mm256_castps_si256(src);
__m256i nan = _mm256_set1_epi32(0xffff);
__m256i mask = _mm256_castps_si256(_mm256_cmp_ps(src, src, _CMP_ORD_Q));
__m256i ones = _mm256_set1_epi32(0x1);
__m256i vec_bias = _mm256_set1_epi32(0x7fff);
// uint32_t lsb = (input >> 16) & 1;
auto t_value = _mm256_and_si256(_mm256_srli_epi32(value, 16), ones);
// uint32_t rounding_bias = 0x7fff + lsb;
t_value = _mm256_add_epi32(t_value, vec_bias);
// input += rounding_bias;
t_value = _mm256_add_epi32(t_value, value);
// input = input >> 16;
t_value = _mm256_srli_epi32(t_value, 16);
// Check NaN before converting back to bf16
t_value = _mm256_blendv_epi8(nan, t_value, mask);
t_value = _mm256_packus_epi32(t_value, t_value); // t[4-7] t[4-7] t[0-4] t[0-4]
t_value = _mm256_permute4x64_epi64(t_value, 0xd8); // 11 01 10 00
return _mm256_castsi256_si128(t_value);
}
static inline __m256i cvtfp32_bf16(const __m256& a, const __m256& b) {
__m256i lo = _mm256_castps_si256(a);
__m256i hi = _mm256_castps_si256(b);
@ -80,6 +114,11 @@ static inline void cvtfp16_fp32(const __m256i& a, __m256& o1, __m256& o2) {
cvtfp16_fp32(hi, o2);
}
static inline __m128i cvtfp32_fp16(const __m256& src) {
return _mm256_cvtps_ph(
src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
}
static inline __m256i cvtfp32_fp16(const __m256& a, const __m256& b) {
__m128i lo = _mm256_cvtps_ph(
a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
@ -265,7 +304,8 @@ public:
}
return b;
}
Vectorized<T> map(const __m256 (*const vop)(__m256)) const {
Vectorized<T> map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const {
__m256 lo, hi;
cvt_to_fp32<T>(values, lo, hi);
const auto o1 = vop(lo);
@ -1026,7 +1066,7 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
CONVERT_VECTORIZED_INIT(BFloat16, bfloat16);
CONVERT_VECTORIZED_INIT(Half, half);
#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#else // defined(CPU_CAPABILITY_AVX2)
#define CONVERT_NON_VECTORIZED_INIT(type, name) \
inline std::tuple<Vectorized<float>, Vectorized<float>> convert_##name##_float(const Vectorized<type>& a) { \
@ -1049,11 +1089,39 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
return Vectorized<type>::loadu(arr2); \
}
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16);
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
inline std::tuple<Vectorized<float>, Vectorized<float>> convert_half_float(const Vectorized<Half>& a) {
static_assert(Vectorized<Half>::size() == 2 * Vectorized<float>::size());
auto arr = reinterpret_cast<const float16_t*>(a.operator const Half*());
float16x8_t x = vld1q_f16(arr);
float32x4_t x1 = vcvt_f32_f16(vget_low_f16(x));
float32x4_t x2 = vcvt_f32_f16(vget_high_f16(x));
float16x8_t y = vld1q_f16(arr + Vectorized<float>::size());
float32x4_t y1 = vcvt_f32_f16(vget_low_f16(y));
float32x4_t y2 = vcvt_f32_f16(vget_high_f16(y));
return { Vectorized<float>(x1, x2), Vectorized<float>(y1, y2) };
}
inline Vectorized<Half> convert_float_half(const Vectorized<float>& a, const Vectorized<float>& b) {
static_assert(Vectorized<Half>::size() == 2 * Vectorized<float>::size());
float32x4x2_t x = a;
float32x4x2_t y = b;
float16x4_t x1 = vcvt_f16_f32(x.val[0]);
float16x4_t x2 = vcvt_f16_f32(x.val[1]);
float16x4_t y1 = vcvt_f16_f32(y.val[0]);
float16x4_t y2 = vcvt_f16_f32(y.val[1]);
Vectorized<Half> rc;
auto arr = reinterpret_cast<float16_t*>(rc.operator Half*());
vst1q_f16(arr, vcombine_f16(x1, x2));
vst1q_f16(arr + Vectorized<float>::size(), vcombine_f16(y1, y2));
return rc;
}
#else
CONVERT_NON_VECTORIZED_INIT(Half, half);
#endif
#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#endif // defined(CPU_CAPABILITY_AVX2)
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX2)
#define LOAD_FP32_VECTORIZED_INIT(type, name) \
inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
auto values = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data)); \
@ -1072,7 +1140,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out1, Vec
LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16);
LOAD_FP32_VECTORIZED_INIT(Half, fp16);
#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#else // defined(CPU_CAPABILITY_AVX2)
#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
__at_align__ float values[Vectorized<float>::size()]; \

View File

@ -8,7 +8,8 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX2)
#define SLEEF_STATIC_LIBS
#include <sleef.h>
#endif
@ -16,7 +17,7 @@ namespace at::vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX2)
template <> class Vectorized<c10::complex<double>> {
private:
@ -145,7 +146,7 @@ public:
auto abs = abs_();
auto zero = _mm256_setzero_pd();
auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ);
auto div = values / abs;
auto div = _mm256_div_pd(values, abs);
return _mm256_blendv_pd(div, zero, mask);
}
__m256d real_() const {

View File

@ -7,7 +7,8 @@
#include <c10/util/irange.h>
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX2)
#define SLEEF_STATIC_LIBS
#include <sleef.h>
#endif
@ -15,7 +16,7 @@ namespace at::vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX2)
template <> class Vectorized<c10::complex<float>> {
private:
@ -180,7 +181,7 @@ public:
auto abs = abs_();
auto zero = _mm256_setzero_ps();
auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ);
auto div = values / abs;
auto div = _mm256_div_ps(values, abs);
return _mm256_blendv_ps(div, zero, mask);
}
__m256 real_() const {

View File

@ -0,0 +1,173 @@
#pragma once
#include <ATen/cpu/vec/functional_bfloat16.h>
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/cpu/vec/vec_convert.h>
namespace at::vec {
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
template <>
struct VecConvert<float, 1, BFloat16, 1> {
static inline VectorizedN<float, 1> apply(
const VectorizedN<BFloat16, 1>& src) {
VectorizedN<float, 1> result;
__m256 value;
cvtbf16_fp32(_mm256_castsi256_si128(src[0]), value);
result[0] = value;
return result;
}
};
template <>
struct VecConvert<float, 1, Half, 1> {
static inline VectorizedN<float, 1> apply(const VectorizedN<Half, 1>& src) {
VectorizedN<float, 1> result;
__m256 value;
cvtfp16_fp32(_mm256_castsi256_si128(src[0]), value);
result[0] = value;
return result;
}
};
template <>
struct VecConvert<BFloat16, 1, float, 1> {
static inline VectorizedN<BFloat16, 1> apply(
const VectorizedN<float, 1>& src) {
VectorizedN<BFloat16, 1> result;
result[0] = _mm256_castsi128_si256(cvtfp32_bf16(src[0]));
return result;
}
};
template <>
struct VecConvert<Half, 1, float, 1> {
static inline VectorizedN<Half, 1> apply(const VectorizedN<float, 1>& src) {
VectorizedN<Half, 1> result;
result[0] = _mm256_castsi128_si256(cvtfp32_fp16(src[0]));
return result;
}
};
template <>
inline Vectorized<double> convert_to_fp_of_same_size<double>(
const Vectorized<int64_t>& src);
template <>
struct VecConvert<float, 1, int64_t, 2> {
static inline VectorizedN<float, 1> apply(
const VectorizedN<int64_t, 2>& src) {
auto low_double = at::vec::convert_to_fp_of_same_size<double>(src[0]);
auto low = _mm256_cvtpd_ps(low_double);
auto high_double = at::vec::convert_to_fp_of_same_size<double>(src[1]);
auto high = _mm256_cvtpd_ps(high_double);
return Vectorized<float>(
_mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1));
}
};
template <>
inline Vectorized<int32_t> convert_to_int_of_same_size<float>(
const Vectorized<float>& src);
template <>
struct VecConvert<int64_t, 2, float, 1> {
static inline VectorizedN<int64_t, 2> apply(
const VectorizedN<float, 1>& src) {
at::vec::VectorizedN<int64_t, 2> result;
auto int32_vec = at::vec::convert_to_int_of_same_size(src[0]);
result[0] = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(int32_vec));
result[1] = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(int32_vec, 1));
return result;
}
};
template <>
struct VecConvert<int32_t, 1, int64_t, 2> {
static inline VectorizedN<int32_t, 1> apply(
const VectorizedN<int64_t, 2>& src) {
auto low = _mm256_shuffle_epi32(src[0], _MM_SHUFFLE(2, 0, 2, 0));
auto high = _mm256_shuffle_epi32(src[1], _MM_SHUFFLE(2, 0, 2, 0));
auto low_perm = _mm256_permute4x64_epi64(low, _MM_SHUFFLE(3, 1, 2, 0));
auto high_perm = _mm256_permute4x64_epi64(high, _MM_SHUFFLE(3, 1, 2, 0));
return Vectorized<int32_t>(_mm256_blend_epi32(low_perm, high_perm, 0xF0));
}
};
template <>
struct VecConvert<int64_t, 2, int32_t, 1> {
static inline VectorizedN<int64_t, 2> apply(
const VectorizedN<int32_t, 1>& src) {
at::vec::VectorizedN<int64_t, 2> result;
result[0] = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(src[0]));
result[1] = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(src[0], 1));
return result;
}
};
template <>
struct VecConvert<int32_t, 1, int8_t, 1> {
static inline VectorizedN<int32_t, 1> apply(
const VectorizedN<int8_t, 1>& src) {
auto src128 = _mm256_castsi256_si128(src[0]);
return Vectorized<int32_t>(_mm256_cvtepi8_epi32(src128));
}
};
template <>
struct VecConvert<int32_t, 1, uint8_t, 1> {
static inline VectorizedN<int32_t, 1> apply(
const VectorizedN<uint8_t, 1>& src) {
auto src128 = _mm256_castsi256_si128(src[0]);
return Vectorized<int32_t>(_mm256_cvtepu8_epi32(src128));
}
};
template <typename dst_t>
struct VecConvert<
dst_t,
1,
int64_t,
2,
typename std::enable_if<
std::is_same_v<dst_t, int8_t> ||
std::is_same_v<dst_t, uint8_t>>::type> {
static inline VectorizedN<dst_t, 1> apply(
const VectorizedN<int64_t, 2>& src) {
return VecConvert<dst_t, 1, int32_t, 1>::apply(
VecConvert<int32_t, 1, int64_t, 2>::apply(src));
}
};
#endif
template <typename src_t>
struct VecConvert<
float,
1,
src_t,
1,
typename std::enable_if_t<is_reduced_floating_point_v<src_t>, void>> {
static inline VectorizedN<float, 1> apply(const VectorizedN<src_t, 1>& src) {
auto [res_vec1, res_vec2] = convert_to_float<src_t>(src[0]);
return res_vec1;
}
};
template <typename dst_t>
struct VecConvert<
dst_t,
1,
float,
1,
typename std::enable_if_t<is_reduced_floating_point_v<dst_t>, void>> {
static inline VectorizedN<dst_t, 1> apply(const VectorizedN<float, 1>& src) {
return convert_from_float<dst_t>(src[0], src[0]);
}
};
} // namespace CPU_CAPABILITY
} // namespace at::vec

View File

@ -6,7 +6,8 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/util/irange.h>
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX2)
#define SLEEF_STATIC_LIBS
#include <sleef.h>
#endif
@ -15,7 +16,7 @@ namespace at::vec {
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX2)
template <> class Vectorized<double> {
private:

View File

@ -6,7 +6,8 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/util/irange.h>
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX2)
#define SLEEF_STATIC_LIBS
#include <sleef.h>
#endif
@ -14,7 +15,7 @@ namespace at::vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX2)
template <> class Vectorized<float> {
private:
@ -226,14 +227,14 @@ public:
static __m256 vec_factorial_5 =
_mm256_set1_ps(0.00828929059f); // 1/factorial(5)
static __m256 vec_exp_log2ef =
(__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e)
_mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e)
static __m256 vec_half = _mm256_set1_ps(0.5f);
static __m256 vec_one = _mm256_set1_ps(1.f);
static __m256 vec_zero = _mm256_set1_ps(0.f);
static __m256 vec_two = _mm256_set1_ps(2.f);
static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2)
static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50);
static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218);
static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2)
static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50));
static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218));
static __m256i vec_127 = _mm256_set1_epi32(0x0000007f);
static int n_mantissa_bits = 23;
@ -266,7 +267,7 @@ public:
auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number);
auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127);
vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits);
auto vec_two_pow_n = (__m256)vec_two_pow_n_i;
auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i);
vec_two_pow_n =
_mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask);

View File

@ -0,0 +1,93 @@
#pragma once
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/cpu/vec/vec_mask.h>
namespace at::vec {
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
template <typename T, typename mask_t>
struct VecMaskLoad<
T,
1,
mask_t,
1,
typename std::enable_if_t<
std::is_same_v<T, float> || std::is_same_v<T, int32_t> ||
std::is_same_v<T, uint32_t>,
void>> {
static inline VectorizedN<T, 1> apply(
const T* ptr,
const VecMask<mask_t, 1>& vec_mask) {
auto int_mask = vec_mask.template cast<int, 1>()[0];
if constexpr (std::is_same_v<T, float>) {
return Vectorized<T>(_mm256_maskload_ps(ptr, int_mask));
} else {
return Vectorized<T>(_mm256_maskload_epi32(ptr, int_mask));
}
}
};
// TODO: add specialization of VecMaskLoad for bfloat16/half and int8/uint8
template <>
struct VecMaskCast<float, 1, int, 1> {
static inline VecMask<float, 1> apply(const VecMask<int, 1>& vec_mask) {
return Vectorized<float>(_mm256_castsi256_ps(vec_mask[0]));
}
};
template <>
struct VecMaskCast<int, 1, float, 1> {
static inline VecMask<int, 1> apply(const VecMask<float, 1>& vec_mask) {
return Vectorized<int>(_mm256_castps_si256(vec_mask[0]));
}
};
template <typename dst_t>
struct VecMaskCast<dst_t, 1, int64_t, 2> {
static inline VecMask<dst_t, 1> apply(const VecMask<int64_t, 2>& vec_mask) {
auto int_vec = convert<int, 1, int64_t, 2>(VectorizedN<int64_t, 2>(vec_mask));
return VecMask<int, 1>(int_vec).cast<dst_t, 1>();
}
};
template <>
inline bool VecMask<int, 1>::all_zero() const {
return _mm256_testz_si256(mask_[0], mask_[0]);
}
template <>
inline bool VecMask<int, 1>::is_masked(int i) const {
return _mm256_movemask_ps(_mm256_castsi256_ps(mask_[0])) & (1 << i);
}
template <>
inline bool VecMask<int, 1>::all_masked() const {
int mask = _mm256_movemask_ps(_mm256_castsi256_ps(mask_[0]));
return mask == 0xff;
}
#define VEC_MASK_METHOD_WITH_CAST_TO_INT( \
T, N, return_type, method, args_def, args) \
template <> \
inline return_type VecMask<T, N>::method args_def const { \
return cast<int, 1>().method args; \
}
VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_zero, (), ())
VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_zero, (), ())
VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, is_masked, (int i), (i))
VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, is_masked, (int i), (i))
VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_masked, (), ())
VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_masked, (), ())
#undef VEC_MASK_DEFINE_METHOD_WITH_CAST_TO_INT
#endif
} // namespace CPU_CAPABILITY
} // namespace at::vec

View File

@ -41,11 +41,17 @@
namespace at::vec {
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX2)
#ifdef _MSC_VER
__declspec(align(64)) struct Vectorizedqi {
protected:
__m256i vals;
#else
struct Vectorizedqi {
protected:
__m256i vals __attribute__((aligned(64)));
#endif
public:
Vectorizedqi() {}
@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized<float> src) {
}
template <typename T>
inline void __attribute__((always_inline)) QuantizeAvx2(
__FORCE_INLINE void QuantizeAvx2(
const float* src,
T* dst,
int len,
@ -1331,5 +1337,5 @@ Vectorized<c10::quint8> inline maximum(const Vectorized<c10::quint8>& a, const V
return a.maximum(b);
}
#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
#endif // if defined(CPU_CAPABILITY_AVX2)
}} // namespace at::vec::CPU_CAPABILITY

View File

@ -13,6 +13,8 @@
#include <ATen/cpu/vec/vec512/vec512_qint.h>
#include <ATen/cpu/vec/vec512/vec512_complex_float.h>
#include <ATen/cpu/vec/vec512/vec512_complex_double.h>
#include <ATen/cpu/vec/vec512/vec512_convert.h>
#include <ATen/cpu/vec/vec512/vec512_mask.h>
#include <algorithm>
#include <cstddef>
@ -55,7 +57,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
}
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX512)
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -80,7 +82,8 @@ inline Vectorized<double> cast<double, int64_t>(const Vectorized<int64_t>& src)
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#ifndef _MSC_VER
// MSVC is not working well on complex function overload.
template<int64_t scale = 1>
std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
inline gather(const double* base_addr, const Vectorized<int64_t>& vindex) {
@ -92,9 +95,10 @@ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorize
inline gather(const float* base_addr, const Vectorized<int32_t>& vindex) {
return _mm512_i32gather_ps(vindex, base_addr, scale);
}
#endif
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#ifndef _MSC_VER
// MSVC is not working well on complex function overload.
template<int64_t scale = 1>
std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
inline mask_gather(const Vectorized<double>& src, const double* base_addr,
@ -112,7 +116,7 @@ inline mask_gather(const Vectorized<float>& src, const float* base_addr,
auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ);
return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale);
}
#endif
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
template<>
@ -270,6 +274,6 @@ inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
return flip8(v);
}
#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#endif // defined(CPU_CAPABILITY_AVX512)
}}}

View File

@ -7,7 +7,8 @@
#include <ATen/cpu/vec/vec_base.h>
#include <c10/util/irange.h>
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX512)
#define SLEEF_STATIC_LIBS
#include <sleef.h>
#endif
@ -16,7 +17,18 @@ namespace vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX512)
#ifndef SLEEF_CONST
#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER)
#define SLEEF_CONST const
#else
#define SLEEF_CONST
#endif
#define SLEEF_CONST_OLD SLEEF_CONST
#else
#define SLEEF_CONST_OLD
#endif
// bfloat16 conversion
static inline void cvtbf16_fp32(const __m256i& a, __m512& o) {
@ -100,6 +112,11 @@ static inline void cvtfp16_fp32(const __m512i& a, __m512& o1, __m512& o2) {
cvtfp16_fp32(hi, o2);
}
static inline __m256i cvtfp32_fp16(const __m512& src) {
return _mm512_cvtps_ph(
src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
}
static inline __m512i cvtfp32_fp16(const __m512& a, const __m512& b) {
__m256i lo = _mm512_cvtps_ph(
a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
@ -362,7 +379,8 @@ public:
}
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wignored-qualifiers"
Vectorized<T> map(const __m512 (*const vop)(__m512)) const {
Vectorized<T> map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const {
__m512 lo, hi;
cvt_to_fp32<T>(values, lo, hi);
const auto o1 = vop(lo);
@ -1571,7 +1589,7 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
CONVERT_VECTORIZED_INIT(BFloat16, bfloat16);
CONVERT_VECTORIZED_INIT(Half, half);
#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#else //defined(CPU_CAPABILITY_AVX512)
#define CONVERT_NON_VECTORIZED_INIT(type, name) \
inline std::tuple<Vectorized<float>, Vectorized<float>> convert_##name##_float(const Vectorized<type>& a) { \
@ -1601,9 +1619,9 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16);
CONVERT_NON_VECTORIZED_INIT(Half, half);
#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#endif // defined(CPU_CAPABILITY_AVX512)
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX512)
#define LOAD_FP32_VECTORIZED_INIT(type, name) \
inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
auto values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data)); \
@ -1622,7 +1640,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out1, Vec
LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16);
LOAD_FP32_VECTORIZED_INIT(Half, fp16);
#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#else // defined(CPU_CAPABILITY_AVX512)
#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
__at_align__ float values[Vectorized<float>::size()]; \

View File

@ -7,7 +7,8 @@
#include <c10/util/irange.h>
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX512)
#define SLEEF_STATIC_LIBS
#include <sleef.h>
#endif
@ -16,7 +17,7 @@ namespace vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX512)
template <> class Vectorized<c10::complex<double>> {
private:
@ -203,7 +204,7 @@ public:
auto abs = abs_();
auto zero = _mm512_setzero_pd();
auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ);
auto div = values / abs;
auto div = _mm512_div_pd(values, abs);
return _mm512_mask_blend_pd(mask, div, zero);
}
__m512d real_() const {

View File

@ -7,7 +7,8 @@
#include <c10/util/irange.h>
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX512)
#define SLEEF_STATIC_LIBS
#include <sleef.h>
#endif
@ -16,7 +17,7 @@ namespace vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX512)
template <> class Vectorized<c10::complex<float>> {
private:
@ -708,7 +709,7 @@ public:
auto abs = abs_();
auto zero = _mm512_setzero_ps();
auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ);
auto div = values / abs;
auto div = _mm512_div_ps(values, abs);
return _mm512_mask_blend_ps(mask, div, zero);
}
__m512 real_() const {

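Both complex specializations above replace `values / abs` with the explicit division intrinsic, since `operator/` on `__m512` / `__m512d` is a GCC/Clang vector extension that MSVC does not accept. A minimal standalone sketch of the single-precision pattern (assumes an AVX-512F build; not ATen code):

#include <immintrin.h>

// Divide lane-wise, then force lanes whose magnitude is zero back to zero,
// mirroring the blend in the method above.
static inline __m512 div_or_zero(__m512 values, __m512 abs) {
  const __m512 zero = _mm512_setzero_ps();
  const __mmask16 zero_mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ);
  const __m512 div = _mm512_div_ps(values, abs);   // portable spelling of values / abs
  return _mm512_mask_blend_ps(zero_mask, div, zero);
}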
View File

@ -0,0 +1,139 @@
#pragma once
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec512/vec512_bfloat16.h>
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/cpu/vec/vec_convert.h>
namespace at::vec {
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
template <>
struct VecConvert<float, 1, BFloat16, 1> {
static inline VectorizedN<float, 1> apply(
const VectorizedN<BFloat16, 1>& src) {
VectorizedN<float, 1> result;
__m512 value;
cvtbf16_fp32(_mm512_castsi512_si256(src[0]), value);
result[0] = value;
return result;
}
};
template <>
struct VecConvert<float, 1, Half, 1> {
static inline VectorizedN<float, 1> apply(const VectorizedN<Half, 1>& src) {
VectorizedN<float, 1> result;
__m512 value;
cvtfp16_fp32(_mm512_castsi512_si256(src[0]), value);
result[0] = value;
return result;
}
};
template <>
struct VecConvert<BFloat16, 1, float, 1> {
static inline VectorizedN<BFloat16, 1> apply(
const VectorizedN<float, 1>& src) {
VectorizedN<BFloat16, 1> result;
result[0] = _mm512_castsi256_si512(cvtfp32_bf16(src[0]));
return result;
}
};
template <>
struct VecConvert<Half, 1, float, 1> {
static inline VectorizedN<Half, 1> apply(const VectorizedN<float, 1>& src) {
VectorizedN<Half, 1> result;
result[0] = _mm512_castsi256_si512(cvtfp32_fp16(src[0]));
return result;
}
};
template <>
struct VecConvert<float, 1, int64_t, 2> {
static inline VectorizedN<float, 1> apply(
const VectorizedN<int64_t, 2>& src) {
auto low = _mm512_cvtepi64_ps(src[0]);
auto high = _mm512_cvtepi64_ps(src[1]);
return Vectorized<float>(
_mm512_insertf32x8(_mm512_castps256_ps512(low), high, 1));
}
};
template <>
struct VecConvert<int64_t, 2, float, 1> {
static inline VectorizedN<int64_t, 2> apply(
const VectorizedN<float, 1>& src) {
at::vec::VectorizedN<int64_t, 2> result;
result[0] = _mm512_cvt_roundps_epi64(
_mm512_castps512_ps256(src[0]), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
result[1] = _mm512_cvt_roundps_epi64(
_mm512_extractf32x8_ps(src[0], 1),
_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
return result;
}
};
template <>
struct VecConvert<int32_t, 1, int64_t, 2> {
static inline VectorizedN<int32_t, 1> apply(
const VectorizedN<int64_t, 2>& src) {
auto low = _mm512_cvtepi64_epi32(src[0]);
auto high = _mm512_cvtepi64_epi32(src[1]);
return Vectorized<int32_t>(
_mm512_inserti32x8(_mm512_castsi256_si512(low), high, 1));
}
};
template <>
struct VecConvert<int64_t, 2, int32_t, 1> {
static inline VectorizedN<int64_t, 2> apply(
const VectorizedN<int32_t, 1>& src) {
at::vec::VectorizedN<int64_t, 2> result;
result[0] = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(src[0]));
result[1] = _mm512_cvtepi32_epi64(_mm512_extracti32x8_epi32(src[0], 1));
return result;
}
};
template <>
struct VecConvert<int32_t, 1, int8_t, 1> {
static inline VectorizedN<int32_t, 1> apply(
const VectorizedN<int8_t, 1>& src) {
auto src128 = _mm512_castsi512_si128(src[0]);
return Vectorized<int32_t>(_mm512_cvtepi8_epi32(src128));
}
};
template <>
struct VecConvert<int32_t, 1, uint8_t, 1> {
static inline VectorizedN<int32_t, 1> apply(
const VectorizedN<uint8_t, 1>& src) {
auto src128 = _mm512_castsi512_si128(src[0]);
return Vectorized<int32_t>(_mm512_cvtepu8_epi32(src128));
}
};
template <typename dst_t>
struct VecConvert<
dst_t,
1,
int64_t,
2,
typename std::enable_if<
std::is_same_v<dst_t, int8_t> ||
std::is_same_v<dst_t, uint8_t>>::type> {
static inline VectorizedN<dst_t, 1> apply(
const VectorizedN<int64_t, 2>& src) {
return VecConvert<dst_t, 1, int32_t, 1>::apply(
VecConvert<int32_t, 1, int64_t, 2>::apply(src));
}
};
#endif
} // namespace CPU_CAPABILITY
} // namespace at::vec

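A hypothetical usage sketch of the float/int64_t pair above (assumes an AVX-512 build of ATen): one 16-lane float register corresponds to two 8-lane int64_t registers, so the destination type is `VectorizedN<int64_t, 2>` rather than a single `Vectorized`. The helper name `float_to_int64` is illustrative only.

#include <ATen/cpu/vec/vec.h>

// Widen a full float vector to int64_t via the VecConvert<int64_t, 2, float, 1>
// specialization above (scalar fallback on non-AVX-512 builds).
void float_to_int64(const float* src, int64_t* dst) {
  using namespace at::vec;
  VectorizedN<float, 1> vf(Vectorized<float>::loadu(src));   // 16 float lanes
  VectorizedN<int64_t, 2> vi = convert<int64_t, 2>(vf);      // 2 x 8 int64_t lanes
  vi.store(dst, VectorizedN<int64_t, 2>::size());
}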
View File

@ -6,7 +6,8 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/util/irange.h>
#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER)
#if (defined(CPU_CAPABILITY_AVX512))
#define SLEEF_STATIC_LIBS
#include <sleef.h>
#endif
@ -15,7 +16,7 @@ namespace vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX512)
template <> class Vectorized<double> {
private:

View File

@ -6,7 +6,8 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/util/irange.h>
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX512)
#define SLEEF_STATIC_LIBS
#include <sleef.h>
#endif
@ -15,7 +16,7 @@ namespace vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX512)
template <> class Vectorized<float> {
private:
@ -246,14 +247,14 @@ public:
static __m512 vec_factorial_5 =
_mm512_set1_ps(0.00828929059f); // 1/factorial(5)
static __m512 vec_exp_log2ef =
(__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e)
_mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e)
static __m512 vec_half = _mm512_set1_ps(0.5f);
static __m512 vec_one = _mm512_set1_ps(1.f);
static __m512 vec_zero = _mm512_set1_ps(0.f);
static __m512 vec_two = _mm512_set1_ps(2.f);
static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2)
static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50);
static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218);
static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2)
static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
static __m512i vec_127 = _mm512_set1_epi32(0x0000007f);
static int n_mantissa_bits = 23;
@ -288,7 +289,7 @@ public:
auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number);
auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127);
vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits);
auto vec_two_pow_n = (__m512)vec_two_pow_n_i;
auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i);
vec_two_pow_n =
_mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero);

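The constants above drop the C-style `(__m512)` casts of integer vectors in favour of `_mm512_castsi512_ps`, which reinterprets the same 512 bits as floats, compiles to no instruction, and is accepted by MSVC (the vector-type cast is a GCC/Clang extension). A standalone illustration:

#include <immintrin.h>

// 0x3f317218 is the IEEE-754 bit pattern of ln(2) as a float; the cast intrinsic
// reinterprets the broadcast integer lanes as floats without any value conversion.
static inline __m512 ln2_vector() {
  return _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218));
}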
View File

@ -1069,7 +1069,7 @@ Vectorized<int8_t> inline maximum(const Vectorized<int8_t>& a, const Vectorized<
template <>
Vectorized<uint8_t> inline maximum(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
return _mm512_max_epi8(a, b);
return _mm512_max_epu8(a, b);
}
template <>

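The one-character family change above matters because `_mm512_max_epi8` compares lanes as signed bytes, so any uint8_t value of 0x80 or above loses to a small positive value, while `_mm512_max_epu8` compares them as unsigned. A tiny standalone check (requires AVX-512BW, e.g. -mavx512bw; not ATen code):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  __m512i a = _mm512_set1_epi8((char)0xFF);  // 255 as uint8_t, -1 as int8_t
  __m512i b = _mm512_set1_epi8(1);
  alignas(64) uint8_t s[64], u[64];
  _mm512_store_si512(reinterpret_cast<__m512i*>(s), _mm512_max_epi8(a, b));
  _mm512_store_si512(reinterpret_cast<__m512i*>(u), _mm512_max_epu8(a, b));
  std::printf("signed max: %u, unsigned max: %u\n", (unsigned)s[0], (unsigned)u[0]);  // 1 vs 255
  return 0;
}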
View File

@ -0,0 +1,155 @@
#pragma once
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/cpu/vec/vec_mask.h>
namespace at::vec {
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
template <typename T, typename mask_t>
struct VecMaskLoad<
T,
1,
mask_t,
1,
typename std::enable_if_t<
std::is_same_v<T, float> || std::is_same_v<T, int32_t> ||
std::is_same_v<T, uint32_t>,
void>> {
static inline VectorizedN<T, 1> apply(
const T* ptr,
const VecMask<mask_t, 1>& vec_mask) {
at::vec::Vectorized<T> zero_vec(0);
auto all_ones = _mm512_set1_epi32(0xFFFFFFFF);
auto int_mask = vec_mask.template cast<int, 1>()[0];
auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ);
if constexpr (std::is_same_v<T, float>) {
return Vectorized<T>(_mm512_mask_loadu_ps(zero_vec, mmask, ptr));
} else {
return Vectorized<T>(_mm512_mask_loadu_epi32(zero_vec, mmask, ptr));
}
}
};
template <typename data_t, typename mask_t>
struct VecMaskLoad<
data_t,
1,
mask_t,
1,
typename std::enable_if<
std::is_same_v<data_t, BFloat16> ||
std::is_same_v<data_t, Half>>::type> {
static inline VectorizedN<data_t, 1> apply(
const data_t* ptr,
const VecMask<mask_t, 1>& vec_mask) {
auto all_ones = _mm512_set1_epi32(0xFFFFFFFF);
auto int_mask = vec_mask.template cast<int, 1>()[0];
auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ);
auto zero = _mm256_set1_epi16(0);
auto temp = _mm256_mask_loadu_epi16(zero, mmask, ptr);
return Vectorized<data_t>(
_mm512_inserti32x8(_mm512_castsi256_si512(temp), zero, 1));
}
};
template <typename data_t, typename mask_t>
struct VecMaskLoad<
data_t,
1,
mask_t,
1,
typename std::enable_if<
std::is_same_v<data_t, int8_t> ||
std::is_same_v<data_t, uint8_t>>::type> {
static inline VectorizedN<data_t, 1> apply(
const data_t* ptr,
const VecMask<mask_t, 1>& vec_mask) {
auto all_ones = _mm512_set1_epi32(0xFFFFFFFF);
auto int_mask = vec_mask.template cast<int, 1>()[0];
auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ);
auto zero = _mm_set1_epi8(0);
auto temp = _mm_mask_loadu_epi8(zero, mmask, ptr);
return Vectorized<data_t>(
_mm512_inserti64x2(_mm512_set1_epi32(0), temp, 0));
}
};
template <typename mask_t>
struct VecMaskLoad<int64_t, 2, mask_t, 1> {
static inline VectorizedN<int64_t, 2> apply(
const int64_t* ptr,
const VecMask<mask_t, 1>& vec_mask) {
auto all_ones = _mm512_set1_epi32(0xFFFFFFFF);
auto zero = _mm512_set1_epi64(0);
auto int_mask = vec_mask.template cast<int, 1>()[0];
auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ);
at::vec::VectorizedN<int64_t, 2> result;
result[0] = _mm512_mask_loadu_epi64(zero, (__mmask8)mmask, ptr);
result[1] = _mm512_mask_loadu_epi64(zero, (__mmask8)(mmask >> 8), ptr + 8);
return result;
}
};
template <>
struct VecMaskCast<float, 1, int, 1> {
static inline VecMask<float, 1> apply(const VecMask<int, 1>& vec_mask) {
return Vectorized<float>(_mm512_castsi512_ps(vec_mask[0]));
}
};
template <>
struct VecMaskCast<int, 1, float, 1> {
static inline VecMask<int, 1> apply(const VecMask<float, 1>& vec_mask) {
return Vectorized<int>(_mm512_castps_si512(vec_mask[0]));
}
};
template <typename dst_t>
struct VecMaskCast<dst_t, 1, int64_t, 2> {
static inline VecMask<dst_t, 1> apply(const VecMask<int64_t, 2>& vec_mask) {
auto int_vec = convert<int, 1, int64_t, 2>(VectorizedN<int64_t, 2>(vec_mask));
return VecMask<int, 1>(int_vec).cast<dst_t, 1>();
}
};
template <>
inline bool VecMask<int, 1>::all_zero() const {
__mmask16 mask = _mm512_test_epi32_mask(mask_[0], mask_[0]);
return mask == 0;
}
template <>
inline bool VecMask<int, 1>::is_masked(int i) const {
return _mm512_movepi32_mask(mask_[0]) & (1 << i);
}
template <>
inline bool VecMask<int, 1>::all_masked() const {
__mmask16 mask = _mm512_movepi32_mask(mask_[0]);
return mask == 0xffff;
}
#define VEC_MASK_METHOD_WITH_CAST_TO_INT( \
T, N, return_type, method, args_def, args) \
template <> \
inline return_type VecMask<T, N>::method args_def const { \
return cast<int, 1>().method args; \
}
VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_zero, (), ())
VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_zero, (), ())
VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, is_masked, (int i), (i))
VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, is_masked, (int i), (i))
VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_masked, (), ())
VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_masked, (), ())
#undef VEC_MASK_METHOD_WITH_CAST_TO_INT
#endif
} // namespace CPU_CAPABILITY
} // namespace at::vec

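A standalone illustration (AVX-512F; not ATen code) of the masked-load building block the specializations above rely on: `_mm512_mask_loadu_ps` reads only the lanes whose mask bit is set and takes the remaining lanes from the pass-through operand, which is what lets `VecMaskLoad` touch a partial row without over-reading.

#include <immintrin.h>

// Load the first n floats (0 <= n <= 16); lanes beyond n come from the zero vector.
static inline __m512 load_first_n(const float* ptr, int n) {
  const __mmask16 m =
      (n >= 16) ? (__mmask16)0xFFFF : (__mmask16)((1u << n) - 1);
  return _mm512_mask_loadu_ps(_mm512_setzero_ps(), m, ptr);
}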
View File

@ -42,11 +42,17 @@ namespace at {
namespace vec {
inline namespace CPU_CAPABILITY {
#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
#if defined(CPU_CAPABILITY_AVX512)
#ifdef _MSC_VER
__declspec(align(64)) struct Vectorizedqi {
protected:
__m512i vals;
#else
struct Vectorizedqi {
protected:
__m512i vals __attribute__((aligned(64)));
#endif
public:
Vectorizedqi() {}
@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized<float> src) {
}
template <typename T>
inline void __attribute__((always_inline)) QuantizeAvx512(
__FORCE_INLINE void QuantizeAvx512(
const float* src,
T* dst,
int len,
@ -525,10 +531,17 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
Vectorized<float> scale,
Vectorized<float> zero_point,
Vectorized<float> scale_neg_zp_premul) const {
#if defined(_MSC_VER) && !defined(__clang__)
__m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
__m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
__m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
__m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
#else
__m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
__m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
__m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
__m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
#endif
__m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0));
__m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1));
@ -549,10 +562,17 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
float_vec_return_type dequantize(
Vectorized<float> scale,
Vectorized<float> zero_point) const {
#if defined(_MSC_VER) && !defined(__clang__)
__m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
__m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
__m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
__m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
#else
__m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
__m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
__m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
__m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
#endif
__m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0));
__m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1));
@ -598,20 +618,34 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
}
int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
#if defined(_MSC_VER) && !defined(__clang__)
__m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
__m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
__m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
__m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
#else
__m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
__m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
__m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
__m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
#endif
__m512i int32_val0 = cvtepi8_epi32(int_val0);
__m512i int32_val1 = cvtepi8_epi32(int_val1);
__m512i int32_val2 = cvtepi8_epi32(int_val2);
__m512i int32_val3 = cvtepi8_epi32(int_val3);
#if defined(_MSC_VER) && !defined(__clang__)
__m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]);
__m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]);
__m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]);
__m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]);
#else
__m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]);
__m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]);
__m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]);
__m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]);
#endif
__m512i int32_b0 = cvtepi8_epi32(int_b0);
__m512i int32_b1 = cvtepi8_epi32(int_b1);
@ -721,10 +755,17 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
Vectorized<float> scale,
Vectorized<float> zero_point,
Vectorized<float> scale_zp_premul) const {
#if defined(_MSC_VER) && !defined(__clang__)
__m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
__m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
__m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
__m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
#else
__m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
__m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
__m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
__m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
#endif
__m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0));
__m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1));
@ -746,10 +787,17 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
float_vec_return_type dequantize(
Vectorized<float> scale,
Vectorized<float> zero_point) const {
#if defined(_MSC_VER) && !defined(__clang__)
__m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
__m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
__m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
__m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
#else
__m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
__m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
__m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
__m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
#endif
__m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0));
__m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1));
@ -796,20 +844,34 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
}
int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
#if defined(_MSC_VER) && !defined(__clang__)
__m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
__m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
__m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
__m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
#else
__m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
__m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
__m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
__m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
#endif
__m512i int32_val0 = cvtepu8_epi32(int_val0);
__m512i int32_val1 = cvtepu8_epi32(int_val1);
__m512i int32_val2 = cvtepu8_epi32(int_val2);
__m512i int32_val3 = cvtepu8_epi32(int_val3);
#if defined(_MSC_VER) && !defined(__clang__)
__m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]);
__m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]);
__m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]);
__m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]);
#else
__m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]);
__m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]);
__m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]);
__m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]);
#endif
__m512i int32_b0 = cvtepu8_epi32(int_b0);
__m512i int32_b1 = cvtepu8_epi32(int_b1);

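The `_MSC_VER` branches above exist because subscripting a `__m512i` directly (`vals[1]`, `b.vals[3]`, ...) is a GCC/Clang vector extension; MSVC instead exposes union members such as `m512i_u64`. A minimal standalone helper showing the same split:

#include <immintrin.h>
#include <cstdint>

// Read one 64-bit lane of a __m512i on either compiler family.
static inline uint64_t lane_u64(const __m512i& v, int i) {
#if defined(_MSC_VER) && !defined(__clang__)
  return v.m512i_u64[i];                 // MSVC union member access
#else
  return static_cast<uint64_t>(v[i]);    // GCC/Clang vector-extension subscript
#endif
}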
View File

@ -36,6 +36,12 @@
#include <c10/util/irange.h>
#include <c10/util/Load.h>
#if defined(__GNUC__)
#define __FORCE_INLINE __attribute__((always_inline)) inline
#elif defined(_MSC_VER)
#define __FORCE_INLINE __forceinline
#endif
// These macros helped us unify vec_base.h
#ifdef CPU_CAPABILITY_AVX512
#if defined(__GNUC__)
@ -228,6 +234,11 @@ public:
std::memcpy(vector.values, ptr, count * sizeof(T));
return vector;
}
static Vectorized<T> loadu_one_fourth(const void* ptr) {
static_assert(std::is_same_v<T, signed char> || std::is_same_v<T, unsigned char>, "For byte types only");
return Vectorized::loadu(ptr, 8);
}
void store(void* ptr, int count = size()) const {
std::memcpy(ptr, values, count * sizeof(T));
}
@ -835,8 +846,8 @@ inline Vectorized<T> operator^(const Vectorized<T>& a, const Vectorized<T>& b) {
template<class T, typename std::enable_if_t<!std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
inline Vectorized<T> operator~(const Vectorized<T>& a) {
Vectorized<T> ones; // All bits are 1
memset((T*) ones, 0xFF, VECTOR_WIDTH);
using int_t = int_same_size_t<T>;
Vectorized<T> ones(c10::bit_cast<T>((int_t)(~(int_t)0))); // All bits are 1
return a ^ ones;
}
@ -1106,3 +1117,8 @@ inline void transpose_mxn(const T* src, int64_t ld_src, T* dst, int64_t ld_dst)
}
}} // namespace at::vec::CPU_CAPABILITY
// additional headers for more operations that depend on vec_base
#include <ATen/cpu/vec/vec_n.h>
#include <ATen/cpu/vec/vec_mask.h>
#include <ATen/cpu/vec/vec_convert.h>

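The rewritten `operator~` above builds its all-ones constant by broadcasting a single scalar whose bits are all set (via `c10::bit_cast` on the same-sized integer type) instead of `memset`-ing a vector buffer. A plain C++ sketch of that scalar construction, using `std::memcpy` where the header uses `c10::bit_cast`:

#include <cstdint>
#include <cstring>

// Produce a T whose bit pattern is all ones; for float this is a NaN pattern,
// which is fine because it is only used as an XOR mask.
template <typename T, typename IntT>
T all_ones_scalar() {
  static_assert(sizeof(T) == sizeof(IntT), "IntT must match the width of T");
  IntT bits = ~IntT(0);
  T out;
  std::memcpy(&out, &bits, sizeof(T));
  return out;
}
// e.g. all_ones_scalar<float, uint32_t>() has bit pattern 0xFFFFFFFF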
View File

@ -0,0 +1,56 @@
#pragma once
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/cpu/vec/vec_n.h>
namespace at::vec {
inline namespace CPU_CAPABILITY {
template <
typename dst_t,
int dst_n,
typename src_t,
int src_n,
typename Enabled = void>
struct VecConvert {
static inline VectorizedN<dst_t, dst_n> apply(
const VectorizedN<src_t, src_n>& src) {
constexpr int count = std::min(
VectorizedN<src_t, src_n>::size(), VectorizedN<dst_t, dst_n>::size());
__at_align__ src_t src_buf[VectorizedN<src_t, src_n>::size()];
src.store(src_buf);
__at_align__ dst_t dst_buf[VectorizedN<dst_t, dst_n>::size()];
for (int i = 0; i < count; i++) {
dst_buf[i] = static_cast<dst_t>(src_buf[i]);
}
return VectorizedN<dst_t, dst_n>::loadu(dst_buf, count);
}
};
template <typename dst_t, typename src_t>
inline Vectorized<dst_t> convert(const Vectorized<src_t>& src) {
return VecConvert<dst_t, 1, src_t, 1>::apply(src);
}
template <
typename dst_t,
int dst_n,
typename src_t,
int src_n,
std::enable_if_t<dst_n != 1, int> = 0>
inline VectorizedN<dst_t, dst_n> convert(const VectorizedN<src_t, src_n>& src) {
return VecConvert<dst_t, dst_n, src_t, src_n>::apply(src);
}
template <
typename dst_t,
int dst_n,
typename src_t,
int src_n,
std::enable_if_t<dst_n == 1, int> = 0>
inline Vectorized<dst_t> convert(const VectorizedN<src_t, src_n>& src) {
return VecConvert<dst_t, dst_n, src_t, src_n>::apply(src);
}
} // namespace CPU_CAPABILITY
} // namespace at::vec

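A hypothetical usage sketch of the generic helper above: on AVX-512 builds `convert<float>` resolves to the `VecConvert<float, 1, BFloat16, 1>` specialization from vec512_convert.h shown earlier, and elsewhere it falls back to the scalar store/cast/load loop of the primary template. The helper name `bf16_to_fp32_head` is illustrative only.

#include <ATen/cpu/vec/vec.h>

// Convert the first Vectorized<float>::size() bf16 lanes to floats.
void bf16_to_fp32_head(const at::BFloat16* src, float* dst) {
  auto v_bf16 = at::vec::Vectorized<at::BFloat16>::loadu(src);
  at::vec::Vectorized<float> v_f32 = at::vec::convert<float>(v_bf16);
  v_f32.store(dst);
}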
View File

@ -0,0 +1,248 @@
#pragma once
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/cpu/vec/vec_n.h>
namespace at::vec {
inline namespace CPU_CAPABILITY {
/**
* The `VecMask` class provides a convenient interface for working with
* vectorized masks in SIMD operations. It encapsulates a `VectorizedN<T, N>`
* mask that can be used directly in masked vectorized operations. It provides
* various methods for manipulating and accessing the mask elements:
* 1. `from` and `to`: Conversion between a vector of boolean values and a
* vectorized mask.
* 2. `cast`: Casts the mask to a different base type.
* 3. `all_zero`: Checks if all mask elements are zero.
* 4. `is_masked`: Checks if a specific element is masked.
* 5. `loadu`: Loads data from memory using the mask.
* 6. `all_masked`: Checks if all mask elements are masked.
*
* Some helper template classes are provided to simplify the specialization of
* the `VecMask` for the specific CPU arch:
* 1. `VecMaskLoad`: Loads data from memory using the mask.
* 2. `VecMaskTo`: Converts the mask to boolean.
* 3. `VecMaskCast`: Casts the mask to a different base type.
*
*/
template <typename T, int N>
class VecMask;
template <
typename data_t,
int data_n,
typename mask_t,
int mask_n,
typename Enabled = void>
struct VecMaskLoad {
static inline VectorizedN<data_t, data_n> apply(
const data_t* ptr,
const VecMask<mask_t, mask_n>& vec_mask) {
constexpr typename VecMask<mask_t, mask_n>::size_type size =
VecMask<mask_t, mask_n>::size();
static_assert(VectorizedN<data_t, data_n>::size() >= size);
__at_align__ data_t data[size];
__at_align__ mask_t mask[size];
auto mask_ = VectorizedN<mask_t, mask_n>(vec_mask);
mask_.store(mask);
for (int i = 0; i < size; i++) {
data[i] = mask[i] ? ptr[i] : static_cast<data_t>(0);
}
return VectorizedN<data_t, data_n>::loadu(data, size);
}
};
template <
typename dst_t,
int dst_n,
typename src_t,
int src_n,
typename Enabled = void>
struct VecMaskTo {
static inline VecMask<dst_t, dst_n> apply(
const VecMask<src_t, src_n>& vec_mask) {
auto zeros = VectorizedN<dst_t, dst_n>(static_cast<dst_t>(0));
auto ones = VectorizedN<dst_t, dst_n>(static_cast<dst_t>(1));
return VectorizedN<dst_t, dst_n>::blendv(
zeros, ones, vec_mask.template cast<dst_t, dst_n>());
}
};
template <typename dst_t, int dst_n, typename src_t, int src_n>
struct VecMaskCast {
static inline VecMask<dst_t, dst_n> apply(
const VecMask<src_t, src_n>& vec_mask) {
return VecMask<dst_t, dst_n>::from(VectorizedN<src_t, src_n>(vec_mask));
}
};
template <typename T, int N>
struct VecMaskCast<T, N, T, N> {
static inline VecMask<T, N> apply(const VecMask<T, N>& vec_mask) {
return vec_mask;
}
};
template <typename T, int N>
class VecMask {
public:
using size_type = int;
static constexpr size_type size() {
return VectorizedN<T, N>::size();
}
private:
VectorizedN<T, N> mask_;
public:
VecMask() : mask_(static_cast<T>(0)) {}
VecMask(const VectorizedN<T, N>& mask) : mask_(mask) {}
template <int L = N, typename std::enable_if_t<L == 1, int> = 0>
VecMask(const Vectorized<T>& mask) : mask_(mask) {}
template <typename U, int L>
static VecMask<T, N> from(const VectorizedN<U, L>& b_vec) {
__at_align__ U b_buf[size()];
if constexpr (size() >= VectorizedN<U, L>::size()) {
b_vec.store(b_buf);
for (int i = VectorizedN<U, L>::size(); i < size(); i++) {
b_buf[i] = static_cast<U>(0);
}
} else {
b_vec.store(b_buf, size());
}
return from(b_buf);
}
template <typename U>
static VecMask<T, N> from(U b) {
using int_t = int_same_size_t<T>;
T mask = b ? c10::bit_cast<T>((int_t)(~(int_t)0)) : (T)0;
return VectorizedN<T, N>(mask);
}
template <typename U>
static VecMask<T, N> from(U* b) {
using int_t = int_same_size_t<T>;
__at_align__ T mask[size()];
#pragma unroll
for (int i = 0; i < size(); i++) {
*(int_t*)(mask + i) = b[i] ? ~(int_t)0 : (int_t)0;
}
return VectorizedN<T, N>(VectorizedN<T, N>::loadu(mask));
}
template <typename U, int L, std::enable_if_t<L >= 2, int> = 0>
inline VectorizedN<U, L> to() const {
return VecMaskTo<U, L, T, N>::apply(*this);
}
template <typename U, int L, std::enable_if_t<L == 1, int> = 0>
inline Vectorized<U> to() const {
return VecMaskTo<U, L, T, N>::apply(*this);
}
template <typename U, int L>
inline VecMask<U, L> cast() const {
return VecMaskCast<U, L, T, N>::apply(*this);
}
inline bool all_zero() const {
__at_align__ T mask[size()];
mask_.store(mask);
return std::all_of(
mask, mask + size(), [](T m) { return m == static_cast<T>(0); });
}
inline bool all_masked() const {
__at_align__ T mask[size()];
mask_.store(mask);
return std::all_of(
mask, mask + size(), [](T m) { return m != static_cast<T>(0); });
}
inline bool is_masked(int i) const {
__at_align__ T mask[size()];
mask_.store(mask);
return mask[i] != static_cast<T>(0);
}
inline operator VectorizedN<T, N>() const {
return mask_;
}
template <int L = N, typename std::enable_if_t<L == 1, int> = 0>
inline operator Vectorized<T>() const {
return mask_[0];
}
inline Vectorized<T> operator[](int i) const {
return mask_[i];
}
template <
typename U,
int L,
std::enable_if_t<L >= 2 && VectorizedN<U, L>::size() >= size(), int> = 0>
VectorizedN<U, L> loadu(const U* ptr) const {
return VecMaskLoad<U, L, T, N>::apply(ptr, *this);
}
template <
typename U,
int L,
std::enable_if_t<L == 1 && Vectorized<U>::size() >= size(), int> = 0>
Vectorized<U> loadu(const U* ptr) const {
return VecMaskLoad<U, L, T, N>::apply(ptr, *this);
}
};
#define VEC_MASK_DEFINE_UNARY_OP_GLOBAL(op) \
template <typename T, int N> \
inline VecMask<T, N> op(const VecMask<T, N>& a) { \
return op(VectorizedN<T, N>(a)); \
}
#define VEC_MASK_DEFINE_BINARY_OP_GLOBAL(op) \
template < \
typename T, \
int N, \
typename V, \
int M, \
std::enable_if_t<VecMask<T, N>::size() == VecMask<V, M>::size(), int> = \
0> \
inline VecMask<T, N> op(const VecMask<T, N>& a, const VecMask<V, M>& b) { \
return op( \
VectorizedN<T, N>(a), VectorizedN<T, N>(b.template cast<T, N>())); \
}
#define VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(op, EXPR) \
template < \
typename T, \
int N, \
typename V, \
int M, \
std::enable_if_t<VecMask<T, N>::size() == VecMask<V, M>::size(), int> = \
0> \
inline VecMask<T, N> op(const VecMask<T, N>& a, const VecMask<V, M>& b) { \
return EXPR; \
}
VEC_MASK_DEFINE_UNARY_OP_GLOBAL(operator~)
VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator&)
VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator|)
VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator^)
VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator>, a & ~b)
VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator<, ~a& b)
VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator==, ~(a ^ b))
VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator>=, (a == b) | (a > b))
VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator<=, (a == b) | (a < b))
#undef VEC_MASK_DEFINE_UNARY_OP_GLOBAL
#undef VEC_MASK_DEFINE_BINARY_OP_GLOBAL
#undef VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL
} // namespace CPU_CAPABILITY
} // namespace at::vec

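A hypothetical usage sketch of `VecMask` built only from the names declared above: create a mask from a bool array, bail out early when nothing is selected, and do a guarded load in which deselected lanes read as zero. `masked_copy` and `keep` are illustrative names; `keep`, `src`, and `dst` are assumed to each provide at least `VecMask<float, 1>::size()` elements.

#include <ATen/cpu/vec/vec.h>

void masked_copy(const bool* keep, const float* src, float* dst) {
  using namespace at::vec;
  auto mask = VecMask<float, 1>::from(keep);         // all-ones bit pattern per kept lane
  if (mask.all_zero()) {
    return;                                          // nothing selected
  }
  Vectorized<float> v = mask.loadu<float, 1>(src);   // zeros where !keep[i]
  v.store(dst);                                      // deselected lanes are written as zero
}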
View File

@ -1,3 +1,5 @@
#pragma once
#include <ATen/cpu/vec/vec_base.h>
#include <array>
@ -83,11 +85,19 @@ class VectorizedN {
}
}
const Vectorized<T>& operator[](int i) const {
template <int L = N, typename std::enable_if_t<L == 1, int> = 0>
VectorizedN(const Vectorized<T>& val) : values({val}) {}
template <int L = N, typename std::enable_if_t<L == 1, int> = 0>
inline operator Vectorized<T>() const {
return values[0];
}
inline const Vectorized<T>& operator[](int i) const {
return values[i];
}
Vectorized<T>& operator[](int i) {
inline Vectorized<T>& operator[](int i) {
return values[i];
}
@ -97,7 +107,7 @@ class VectorizedN {
const VectorizedN<T, N>& b) {
VectorizedN<T, N> result;
for (int i = 0; i < N; ++i) {
result.values[i] = Vectorized<T>::blend<mask>(a.values[i], b.values[i]);
result.values[i] = Vectorized<T>::template blend<mask>(a.values[i], b.values[i]);
}
return result;
}
@ -132,8 +142,10 @@ class VectorizedN {
int64_t count = size()) {
VectorizedN<T, N> result;
for (int i = 0; i < N; ++i) {
result.values[i] =
Vectorized<T>::set(a.values[i], b.values[i], std::min(count, Vectorized<T>::size()));
result.values[i] = Vectorized<T>::set(
a.values[i],
b.values[i],
std::min(count, (int64_t)Vectorized<T>::size()));
count -= Vectorized<T>::size();
if (count <= 0) {
break;
@ -154,8 +166,8 @@ class VectorizedN {
static VectorizedN<T, N> loadu(const void* ptr, int64_t count) {
VectorizedN<T, N> result;
for (int i = 0; i < N; ++i) {
result.values[i] =
Vectorized<T>::loadu(ptr, std::min(count, Vectorized<T>::size()));
result.values[i] = Vectorized<T>::loadu(
ptr, std::min(count, (int64_t)Vectorized<T>::size()));
ptr = static_cast<const T*>(ptr) + Vectorized<T>::size();
count -= Vectorized<T>::size();
if (count <= 0) {
@ -174,7 +186,7 @@ class VectorizedN {
void store(void* ptr, int count) const {
for (int i = 0; i < N; ++i) {
values[i].store(ptr, std::min(count, Vectorized<T>::size()));
values[i].store(ptr, std::min(count, (int)Vectorized<T>::size()));
ptr = static_cast<T*>(ptr) + Vectorized<T>::size();
count -= Vectorized<T>::size();
if (count <= 0) {
@ -341,4 +353,4 @@ inline T vec_reduce_all(const OpVec& vec_fun, VectorizedN<T, N> acc_vec) {
}
} // namespace CPU_CAPABILITY
} // namespace at::vec
} // namespace at::vec

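A small sketch of the quality-of-life additions above: with the new N == 1 converting constructor and conversion operator, a `VectorizedN<float, 1>` can be passed around interchangeably with `Vectorized<float>`. The helper name `first_lane` is illustrative only.

#include <ATen/cpu/vec/vec.h>

float first_lane(const float* ptr) {
  using namespace at::vec;
  VectorizedN<float, 1> v(Vectorized<float>::loadu(ptr));  // implicit wrap (N == 1 only)
  Vectorized<float> back = v;                              // implicit unwrap via operator Vectorized<T>()
  __at_align__ float buf[Vectorized<float>::size()];
  back.store(buf);
  return buf[0];
}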
View File

@ -48,7 +48,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent {
CUDAGuard guard(device_index_);
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_deletion(reinterpret_cast<uintptr_t>(event_));
(*interp)->trace_gpu_event_deletion(at::kCUDA, reinterpret_cast<uintptr_t>(event_));
}
AT_CUDA_CHECK(cudaEventDestroy(event_));
}
@ -122,7 +122,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent {
AT_CUDA_CHECK(cudaEventRecord(event_, stream));
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_record(
(*interp)->trace_gpu_event_record(at::kCUDA,
reinterpret_cast<uintptr_t>(event_),
reinterpret_cast<uintptr_t>(stream.stream())
);
@ -138,7 +138,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent {
AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_, 0));
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_wait(
(*interp)->trace_gpu_event_wait(at::kCUDA,
reinterpret_cast<uintptr_t>(event_),
reinterpret_cast<uintptr_t>(stream.stream())
);
@ -165,7 +165,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent {
if (is_created_) {
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_synchronization(reinterpret_cast<uintptr_t>(event_));
(*interp)->trace_gpu_event_synchronization(at::kCUDA, reinterpret_cast<uintptr_t>(event_));
}
AT_CUDA_CHECK(cudaEventSynchronize(event_));
}
@ -195,7 +195,7 @@ private:
AT_CUDA_CHECK(cudaEventCreateWithFlags(&event_, flags_));
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_event_creation(reinterpret_cast<uintptr_t>(event_));
(*interp)->trace_gpu_event_creation(at::kCUDA, reinterpret_cast<uintptr_t>(event_));
}
is_created_ = true;
}

View File

@ -1,5 +1,8 @@
#include <ATen/Functions.h>
#include <ATen/Tensor.h>
#include <ATen/Utils.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/cuda/CUDAGraph.h>
#include <ATen/cuda/CUDAGraphsUtils.cuh>
#include <c10/core/StreamGuard.h>
#include <c10/cuda/CUDAFunctions.h>
@ -24,10 +27,10 @@ static std::deque<c10::once_flag> cuda_gens_init_flag;
static std::vector<Generator> default_gens_cuda;
/*
* Populates the global variables related to CUDA generators
* Warning: this function must only be called once!
*/
static void initCUDAGenVector(){
* Populates the global variables related to CUDA generators
* Warning: this function must only be called once!
*/
static void initCUDAGenVector() {
num_gpus = c10::cuda::device_count();
cuda_gens_init_flag.resize(num_gpus);
default_gens_cuda.resize(num_gpus);
@ -77,6 +80,150 @@ Generator createCUDAGenerator(DeviceIndex device_index) {
} // namespace cuda::detail
/**
* Creates a clone of this CUDA Generator State.
*/
c10::intrusive_ptr<CUDAGeneratorState> CUDAGeneratorState::clone() {
return make_intrusive<CUDAGeneratorState>(
seed_, philox_offset_per_thread_, offset_intragraph_);
}
/**
* Function to increase the internal offset based on the specified increment.
*/
void CUDAGeneratorState::increase(uint64_t increment) {
// Rounds increment up to the nearest multiple of 4 to meet alignment
// requirements.
// see Note [Why enforce RNG offset % 4 == 0?]
increment = ((increment + 3) / 4) * 4;
// Handling different behaviors based on whether capturing is active.
if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) {
// Ensures that the state is actually capturing.
TORCH_CHECK(
capturing_,
"Attempt to increase offset for a CUDA generator not in capture mode.");
// Ensures the offset is a multiple of 4
// see Note [Why enforce RNG offset % 4 == 0?]
TORCH_INTERNAL_ASSERT(
offset_intragraph_ % 4 == 0, "RNG offset must be a multiple of 4.");
// Ensures the increment does not cause overflow.
TORCH_INTERNAL_ASSERT(
offset_intragraph_ <= std::numeric_limits<uint32_t>::max() - increment,
"Increment causes overflow in the offset value.");
offset_intragraph_ += increment;
} else {
// Checks that the increment is expected outside graph capturing.
TORCH_CHECK(
!capturing_,
"Offset increment outside graph capture encountered unexpectedly.");
// Ensures the offset is a multiple of 4
// see Note [Why enforce RNG offset % 4 == 0?]
TORCH_INTERNAL_ASSERT(
philox_offset_per_thread_ % 4 == 0,
"RNG offset must be a multiple of 4.");
philox_offset_per_thread_ += increment;
}
}
/**
* Registers this state to a CUDA graph to manage within the graph.
*/
void CUDAGeneratorState::register_graph(cuda::CUDAGraph* graph) {
// Ensures that the RNG state is not currently being captured.
at::cuda::assertNotCapturing(
"Cannot register the state during capturing stage.");
// If this is the first graph to be registered, allocate memory for the seed
// and offset on the GPU.
if (registered_graphs_.empty()) {
auto options = at::TensorOptions().device(at::kCUDA).dtype(at::kLong);
seed_extragraph_ = at::empty({1}, options);
offset_extragraph_ = at::empty({1}, options);
}
// Insert the graph into the set of registered graphs if it's not already
// registered.
if (registered_graphs_.find(graph) == registered_graphs_.end()) {
registered_graphs_.insert(graph);
}
}
/**
* Unregisters a CUDA graph from the RNG state.
*/
void CUDAGeneratorState::unregister_graph(cuda::CUDAGraph* graph) {
// Ensures that the RNG state is not currently being captured.
at::cuda::assertNotCapturing(
"Cannot unregister the state during capturing stage.");
// Verify the graph was previously registered.
TORCH_CHECK(
registered_graphs_.find(graph) != registered_graphs_.end(),
"The graph should be registered to the state");
// Remove the graph from the set of registered graphs.
registered_graphs_.erase(graph);
// If no more graphs are registered, deallocate the GPU memory for the seed
// and offset.
if (registered_graphs_.empty()) {
seed_extragraph_.reset();
offset_extragraph_.reset();
}
}
/**
* Note [Explicit Registration of Generators to the CUDA Graph]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* Ideally, it would be more user-friendly if the state could be exchanged and generators
* could be registered with the CUDA graph implicitly. However, resetting GPU tensors during
* the capture stage causes these reset operations to be recorded within the CUDA graph.
* This behavior is undesirable because we do not want these tensors to be reset during
* the replay stage of the graph.
*
* As of now, there is no available method to perform a CUDA operation during the graph's
* recording phase without having that operation be included in the CUDA graph.
* This limitation necessitates explicit user action to register generators with the graph.
* By requiring users to manually register their generators, we can ensure that state resets
* (capture_prologue) only occur before the graph capture begins, thus avoiding unintended
* resets during the replay of the graph. See https://github.com/pytorch/pytorch/pull/114068.
*/
/**
* Performs the prologue steps for capturing a CUDA graph state.
* This method is intended to reset graph-related state variables before capturing begins.
*/
void CUDAGeneratorState::capture_prologue() {
capturing_ = true;
offset_intragraph_ = 0;
seed_extragraph_.fill_(int64_t(seed_));
offset_extragraph_.fill_(int64_t(0));
}
/**
* Ends the capturing phase and resets related variables, returning the whole
* graph increment.
*/
uint64_t CUDAGeneratorState::capture_epilogue() {
capturing_ = false;
return offset_intragraph_;
}
/**
* Prepares the state for replay by setting initial state tensors and applying
* total increment.
*/
void CUDAGeneratorState::replay_prologue(uint64_t wholegraph_increment) {
// Ensures the generator is not in capturing mode.
at::cuda::assertNotCapturing(
"Cannot prepare for replay during capturing stage.");
seed_extragraph_.fill_(int64_t(seed_));
offset_extragraph_.fill_(int64_t(philox_offset_per_thread_));
// Applies the total increment achieved during previous captures to update the
// offset.
increase(wholegraph_increment);
}
/**
* Note [Why enforce RNG offset % 4 == 0?]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -97,8 +244,18 @@ Generator createCUDAGenerator(DeviceIndex device_index) {
*/
CUDAGeneratorImpl::CUDAGeneratorImpl(DeviceIndex device_index)
: c10::GeneratorImpl{Device(DeviceType::CUDA, device_index),
DispatchKeySet(c10::DispatchKey::CUDA)} {
DispatchKeySet(c10::DispatchKey::CUDA)} {
at::cuda::assertNotCapturing("Cannot construct a new CUDAGeneratorImpl");
state_ = make_intrusive<CUDAGeneratorState>();
no_reset_rnn_state_.clear();
}
CUDAGeneratorImpl::CUDAGeneratorImpl(
DeviceIndex device_index,
c10::intrusive_ptr<CUDAGeneratorState> state)
: c10::
GeneratorImpl{Device(DeviceType::CUDA, device_index), DispatchKeySet(c10::DispatchKey::CUDA)},
state_(std::move(state)) {
no_reset_rnn_state_.clear();
}
@ -109,9 +266,10 @@ CUDAGeneratorImpl::CUDAGeneratorImpl(DeviceIndex device_index)
* See Note [Acquire lock when using random generators]
*/
void CUDAGeneratorImpl::set_current_seed(uint64_t seed) {
at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::set_current_seed");
seed_ = seed;
philox_offset_per_thread_ = 0;
at::cuda::assertNotCapturing(
"Cannot call CUDAGeneratorImpl::set_current_seed");
state_->seed_ = seed;
state_->philox_offset_per_thread_ = 0;
no_reset_rnn_state_.clear();
}
@ -134,15 +292,9 @@ uint64_t CUDAGeneratorImpl::get_offset() const {
// Debatable if get_offset() should be allowed in captured regions.
// Conservatively disallow it for now.
at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::get_offset");
return philox_offset_per_thread_;
return state_->philox_offset_per_thread_;
}
#define CAPTURE_DEFAULT_GENS_MSG \
"In regions captured by CUDA graphs, you may only use the default CUDA RNG " \
"generator on the device that's current when capture begins. " \
"If you need a non-default (user-supplied) generator, or a generator on another " \
"device, please file an issue."
/**
* Gets the current seed of CUDAGeneratorImpl.
*/
@ -150,7 +302,7 @@ uint64_t CUDAGeneratorImpl::current_seed() const {
// Debatable if current_seed() should be allowed in captured regions.
// Conservatively disallow it for now.
at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed");
return seed_;
return state_->seed_;
}
/**
@ -194,6 +346,8 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
* and size of the internal state.
*/
void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
at::cuda::assertNotCapturing(
"Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing.");
static const size_t seed_size = sizeof(uint64_t);
static const size_t offset_size = sizeof(int64_t);
static const size_t total_size = seed_size + offset_size;
@ -208,7 +362,7 @@ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
TORCH_CHECK(new_state_size == total_size, "RNG state is wrong size");
}
uint64_t input_seed;
uint64_t input_seed = 0;
auto new_rng_state = new_state.data_dtype_initialized<uint8_t>();
memcpy(&input_seed, new_rng_state, seed_size);
this->set_current_seed(input_seed);
@ -219,44 +373,59 @@ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
this->set_philox_offset_per_thread(static_cast<uint64_t>(philox_offset));
}
/**
* Sets the generator's current state to that of another registered generator.
* This function allows switching between different registered states of
* the generator.
*/
void CUDAGeneratorImpl::graphsafe_set_state(
const c10::intrusive_ptr<GeneratorImpl>& gen) {
c10::intrusive_ptr<CUDAGeneratorImpl> cuda_gen =
dynamic_intrusive_pointer_cast<CUDAGeneratorImpl>(gen);
TORCH_CHECK(cuda_gen, "Expected a CUDA Generator");
state_ = cuda_gen->state_;
}
/**
* Get the GeneratorImpl that point to current state_
*/
c10::intrusive_ptr<c10::GeneratorImpl> CUDAGeneratorImpl::graphsafe_get_state()
const {
auto gen = make_intrusive<CUDAGeneratorImpl>(device().index(), state_);
return gen;
}
/**
* Sets the philox_offset_per_thread_ to be used by curandStatePhilox4_32_10
*
* See Note [Acquire lock when using random generators]
*/
void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) {
at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::set_philox_offset_per_thread");
// see Note [Why enforce RNG offset % 4 == 0?]
TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4");
philox_offset_per_thread_ = offset;
state_->philox_offset_per_thread_ = offset;
}
/**
* Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl.
*/
uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const {
at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::philox_offset_per_thread");
return philox_offset_per_thread_;
return state_->philox_offset_per_thread_;
}
/**
* Called by CUDAGraph to prepare this instance for a graph capture region.
* offset_extragraph is the initial offset at the start of the graphed region.
* offset_intragraph tracks the offset in the graphed region.
* Registers this state to a CUDA graph to manage within the graph.
*/
void CUDAGeneratorImpl::capture_prologue(int64_t* seed_extragraph, int64_t* offset_extragraph) {
seed_extragraph_ = seed_extragraph;
offset_extragraph_ = offset_extragraph;
offset_intragraph_ = 0;
graph_expects_this_gen_ = true;
void CUDAGeneratorImpl::register_graph(cuda::CUDAGraph* graph) {
graph->register_generator_state(state_);
state_->register_graph(graph);
}
/**
* Called by CUDAGraph to finalize a graph capture region for this instance.
* Unregisters a CUDA graph from the RNG state.
*/
uint64_t CUDAGeneratorImpl::capture_epilogue() {
graph_expects_this_gen_ = false;
return offset_intragraph_;
void CUDAGeneratorImpl::unregister_graph(cuda::CUDAGraph* graph) {
state_->unregister_graph(graph);
}
/**
@ -281,30 +450,17 @@ uint64_t CUDAGeneratorImpl::capture_epilogue() {
* See Note [Acquire lock when using random generators]
*/
PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) {
// rounds increment up to the nearest multiple of 4
increment = ((increment + 3) / 4) * 4;
if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) {
TORCH_CHECK(graph_expects_this_gen_,
"philox_cuda_state for an unexpected CUDA generator used during capture. "
CAPTURE_DEFAULT_GENS_MSG);
// see Note [Why enforce RNG offset % 4 == 0?]
TORCH_INTERNAL_ASSERT(this->offset_intragraph_ % 4 == 0);
uint32_t offset = this->offset_intragraph_;
TORCH_INTERNAL_ASSERT(this->offset_intragraph_ <=
std::numeric_limits<uint32_t>::max() - increment);
this->offset_intragraph_ += increment;
return PhiloxCudaState(this->seed_extragraph_,
this->offset_extragraph_,
offset);
uint32_t offset = state_->offset_intragraph_;
state_->increase(increment);
return PhiloxCudaState(
state_->seed_extragraph_.data_ptr<int64_t>(),
state_->offset_extragraph_.data_ptr<int64_t>(),
offset);
} else {
TORCH_CHECK(!graph_expects_this_gen_,
"CUDA generator expects graph capture to be underway, "
"but the current stream is not capturing.");
// see Note [Why enforce RNG offset % 4 == 0?]
TORCH_INTERNAL_ASSERT(this->philox_offset_per_thread_ % 4 == 0);
uint64_t offset = this->philox_offset_per_thread_;
this->philox_offset_per_thread_ += increment;
return PhiloxCudaState(this->seed_, offset);
uint64_t offset = state_->philox_offset_per_thread_;
state_->increase(increment);
return PhiloxCudaState(state_->seed_, offset);
}
}
@ -312,16 +468,13 @@ PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) {
* Temporarily accommodates call sites that use philox_engine_inputs.
* Allows incremental refactor of call sites to use philox_cuda_state.
*/
std::pair<uint64_t, uint64_t> CUDAGeneratorImpl::philox_engine_inputs(uint64_t increment) {
at::cuda::assertNotCapturing("Refactor this op to use CUDAGeneratorImpl::philox_cuda_state. "
"Cannot call CUDAGeneratorImpl::philox_engine_inputs");
// rounds increment up to the nearest multiple of 4
increment = ((increment + 3) / 4) * 4;
// see Note [Why enforce RNG offset % 4 == 0?]
TORCH_INTERNAL_ASSERT(this->philox_offset_per_thread_ % 4 == 0);
uint64_t offset = this->philox_offset_per_thread_;
this->philox_offset_per_thread_ += increment;
return std::make_pair(this->seed_, offset);
std::pair<uint64_t, uint64_t> CUDAGeneratorImpl::philox_engine_inputs(
uint64_t increment) {
at::cuda::assertNotCapturing(
"Refactor this op to use CUDAGeneratorImpl::philox_cuda_state. Cannot call CUDAGeneratorImpl::philox_engine_inputs");
uint64_t offset = state_->philox_offset_per_thread_;
state_->increase(increment);
return std::make_pair(state_->seed_, offset);
}
/*
@ -348,9 +501,7 @@ std::shared_ptr<CUDAGeneratorImpl> CUDAGeneratorImpl::clone() const {
*/
CUDAGeneratorImpl* CUDAGeneratorImpl::clone_impl() const {
at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::clone_impl");
auto gen = new CUDAGeneratorImpl(this->device().index());
gen->set_current_seed(this->seed_);
gen->set_philox_offset_per_thread(this->philox_offset_per_thread_);
auto gen = new CUDAGeneratorImpl(this->device().index(), state_->clone());
return gen;
}

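A worked illustration of the rounding step inside `CUDAGeneratorState::increase()`: the Philox offset must stay a multiple of 4 (see Note [Why enforce RNG offset % 4 == 0?]), so the requested increment is first rounded up to the next multiple of 4.

#include <cstdint>

constexpr uint64_t round_up_to_multiple_of_4(uint64_t increment) {
  return ((increment + 3) / 4) * 4;   // same expression as in increase()
}
static_assert(round_up_to_multiple_of_4(1) == 4, "");
static_assert(round_up_to_multiple_of_4(4) == 4, "");
static_assert(round_up_to_multiple_of_4(10) == 12, "");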
View File

@ -1,12 +1,19 @@
#pragma once
#include <ATen/core/Generator.h>
#include <ATen/cuda/PhiloxCudaState.h>
#include <ATen/Context.h>
#include <limits>
#include <ATen/core/Generator.h>
#include <ATen/core/TensorBase.h>
#include <ATen/cuda/PhiloxCudaState.h>
#include <atomic>
#include <limits>
#include <memory>
#include <unordered_set>
namespace at {
namespace cuda {
struct CUDAGraph;
}
/**
* Note [CUDA Graph-safe RNG states]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -87,9 +94,41 @@ namespace at {
*
*/
struct CUDAGeneratorState : public c10::intrusive_ptr_target {
uint64_t seed_;
uint64_t philox_offset_per_thread_;
uint32_t offset_intragraph_;
bool capturing_{};
std::unordered_set<cuda::CUDAGraph*> registered_graphs_;
at::TensorBase seed_extragraph_{};
at::TensorBase offset_extragraph_{};
CUDAGeneratorState(
uint64_t seed = default_rng_seed_val,
uint64_t philox_offset_per_thread = 0,
uint32_t offset_intragraph = 0)
: seed_(seed),
philox_offset_per_thread_(philox_offset_per_thread),
offset_intragraph_(offset_intragraph) {}
void increase(uint64_t increment);
void register_graph(cuda::CUDAGraph* graph);
void unregister_graph(cuda::CUDAGraph* graph);
void capture_prologue();
// capture_epilogue returns the wholegraph_increment
uint64_t capture_epilogue();
void replay_prologue(uint64_t wholegraph_increment);
c10::intrusive_ptr<CUDAGeneratorState> clone();
};
struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl {
// Constructors
CUDAGeneratorImpl(DeviceIndex device_index = -1);
CUDAGeneratorImpl(
DeviceIndex device_index,
c10::intrusive_ptr<CUDAGeneratorState> state_);
~CUDAGeneratorImpl() override = default;
// CUDAGeneratorImpl methods
@ -101,10 +140,18 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl {
uint64_t seed() override;
void set_state(const c10::TensorImpl& new_state) override;
c10::intrusive_ptr<c10::TensorImpl> get_state() const override;
void graphsafe_set_state(
const c10::intrusive_ptr<GeneratorImpl>& state) override;
c10::intrusive_ptr<c10::GeneratorImpl> graphsafe_get_state() const override;
void set_philox_offset_per_thread(uint64_t offset);
uint64_t philox_offset_per_thread() const;
void capture_prologue(int64_t* seed_extragraph, int64_t* offset_extragraph);
uint64_t capture_epilogue();
void register_graph(cuda::CUDAGraph* graph);
void unregister_graph(cuda::CUDAGraph* graph);
// Generates a PhiloxCudaState with a specified increment, and increments the
// current state accordingly
PhiloxCudaState philox_cuda_state(uint64_t increment);
bool reset_rnn_state() {
@ -117,14 +164,10 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl {
static c10::DeviceType device_type();
private:
private:
CUDAGeneratorImpl* clone_impl() const override;
uint64_t seed_ = default_rng_seed_val;
uint64_t philox_offset_per_thread_ = 0;
int64_t* seed_extragraph_{};
int64_t* offset_extragraph_{};
uint32_t offset_intragraph_ = 0;
bool graph_expects_this_gen_ = false;
c10::intrusive_ptr<CUDAGeneratorState> state_;
std::atomic_flag no_reset_rnn_state_;
};

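A hedged sketch of the graph-safe state API declared above: `graphsafe_set_state` makes one generator point at another generator's `CUDAGeneratorState`, so offset increases performed through either of them are visible to both. The function name `share_rng_state` is illustrative only.

#include <ATen/cuda/CUDAGeneratorImpl.h>

void share_rng_state(at::CUDAGeneratorImpl& dst, const at::CUDAGeneratorImpl& src) {
  dst.graphsafe_set_state(src.graphsafe_get_state());  // dst now shares src's state_
}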
View File

@ -6,7 +6,10 @@
#include <c10/cuda/CUDAFunctions.h>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <thread>
#include <vector>
namespace at::cuda {
@ -86,26 +89,33 @@ CUDAGraph::CUDAGraph()
#endif
}
void CUDAGraph::register_generator_state(
c10::intrusive_ptr<at::CUDAGeneratorState> state) {
captured_generator_states_[std::move(state)] = 0;
}
void CUDAGraph::register_generator_state(const at::Generator& generator) {
c10::intrusive_ptr<CUDAGeneratorImpl> cuda_gen =
dynamic_intrusive_pointer_cast<CUDAGeneratorImpl>(
generator.getIntrusivePtr());
cuda_gen->register_graph(this);
}
void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capture_mode) {
#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
TORCH_CHECK(!has_graph_exec_,
"This CUDAGraph instance already owns a captured graph. "
"To capture a new graph, create a new instance.");
// For now, a CUDAGraph instance only accommodates the default generator on the device that's
// current when capture begins. If any op in the captured region uses a non-default generator,
// or a generator on another device, the offending generator will throw an error.
// These restrictions simplify CUDAGraph, but could be relaxed in the future:
// in principle, the underlying Cuda calls do permit cross-device ops to be captured.
// default generator is always registered
auto* gen = get_generator_or_default<CUDAGeneratorImpl>(
c10::nullopt, cuda::detail::getDefaultCUDAGenerator());
gen->register_graph(this);
auto options = TensorOptions().device(at::kCUDA).dtype(at::kLong);
seed_extragraph_ = at::empty({1}, options);
offset_extragraph_ = at::empty({1}, options);
seed_extragraph_.fill_(int64_t(gen->current_seed()));
gen->capture_prologue(seed_extragraph_.data_ptr<int64_t>(), offset_extragraph_.mutable_data_ptr<int64_t>());
for (auto& [generator_state, wholegraph_increments] :
captured_generator_states_) {
generator_state->capture_prologue();
}
auto stream = at::cuda::getCurrentCUDAStream();
@ -115,7 +125,6 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt
"default stream.)");
capture_stream_ = stream;
capture_gen_ = gen;
capture_dev_ = c10::cuda::current_device();
id_ = capture_sequence_id();
@ -215,13 +224,10 @@ void CUDAGraph::capture_end() {
has_graph_exec_ = true;
auto* gen = get_generator_or_default<CUDAGeneratorImpl>(
c10::nullopt, cuda::detail::getDefaultCUDAGenerator());
TORCH_CHECK(gen == capture_gen_,
"Default CUDA RNG generator on current device at capture end "
"is different from default generator on current device "
"when capture began");
wholegraph_increment_ = gen->capture_epilogue();
for (auto& [generator_state, wholegraph_increments] :
captured_generator_states_) {
wholegraph_increments = generator_state->capture_epilogue();
}
size_t numCUDAGraphNodes = 0;
AT_CUDA_CHECK(cudaGraphGetNodes(graph_, NULL, &numCUDAGraphNodes));
@ -251,17 +257,10 @@ void CUDAGraph::replay() {
c10::OptionalDeviceGuard device_guard{capture_stream_.device()};
// Just like any RNG consumer kernel!
auto* gen = get_generator_or_default<CUDAGeneratorImpl>(
c10::nullopt, cuda::detail::getDefaultCUDAGenerator());
PhiloxCudaState rng_engine_inputs;
{
std::lock_guard<std::mutex> lock(gen->mutex_);
rng_engine_inputs = gen->philox_cuda_state(wholegraph_increment_);
for (auto& [generator_state, wholegraph_increments] :
captured_generator_states_) {
generator_state->replay_prologue(wholegraph_increments);
}
seed_extragraph_.fill_(int64_t(gen->current_seed()));
offset_extragraph_.fill_(int64_t(rng_engine_inputs.offset_.val));
// graph_exec_ may be replayed in any stream.
AT_CUDA_CHECK(cudaGraphLaunch(graph_exec_, at::cuda::getCurrentCUDAStream()));
@ -355,6 +354,10 @@ TORCH_CHECK(has_graph_exec_,
}
CUDAGraph::~CUDAGraph() {
for (auto& [generator_state, wholegraph_increments] :
captured_generator_states_) {
generator_state->unregister_graph(this);
}
reset();
}

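A hedged usage sketch of the registration flow implemented above: the default CUDA generator is registered automatically inside `capture_begin()`, while a non-default generator must be registered explicitly before capture starts (see Note [Explicit Registration of Generators to the CUDA Graph]). The function name `capture_with_custom_rng` is illustrative only.

#include <ATen/core/Generator.h>
#include <ATen/cuda/CUDAGraph.h>

void capture_with_custom_rng(at::cuda::CUDAGraph& graph, const at::Generator& my_gen) {
  graph.register_generator_state(my_gen);  // explicit registration before capture
  graph.capture_begin();
  // ... enqueue RNG-consuming kernels on the capturing stream ...
  graph.capture_end();
  graph.replay();  // replay_prologue() refreshes each registered seed/offset first
}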
View File

@ -4,12 +4,13 @@
#include <c10/core/Device.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAStream.h>
#include <mutex>
#include <c10/util/flat_hash_map.h>
namespace at {
struct Generator;
struct CUDAGeneratorImpl;
struct CUDAGeneratorState;
namespace cuda {
@ -24,7 +25,12 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
static void inc_pending_event_queries();
static void dec_pending_event_queries();
static int num_pending_event_queries();
void capture_begin(MempoolId_t pool={0, 0}, cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal);
// See Note [Explicit Registration of Generators to the CUDA Graph]
void register_generator_state(c10::intrusive_ptr<at::CUDAGeneratorState> state);
void register_generator_state(const at::Generator& generator);
void capture_begin(
MempoolId_t pool = {0, 0},
cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal);
void capture_end();
void replay();
void reset();
@ -32,7 +38,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
void enable_debug_mode();
void debug_dump(const std::string& debug_path);
protected:
protected:
#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
cudaGraph_t graph_ = NULL;
cudaGraphExec_t graph_exec_ = NULL;
@ -73,19 +79,16 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
// Stream on which capture began
at::cuda::CUDAStream capture_stream_;
// Default generator on device where capture began
at::CUDAGeneratorImpl* capture_gen_;
// multiple generator states and their wholegraph_increments in this graph
// that are managed by the CUDA Graph
ska::flat_hash_map<c10::intrusive_ptr<at::CUDAGeneratorState>, uint64_t>
captured_generator_states_;
// Device where capture occurred. Right now, for simplicity, we require all ops
// in a capture to run on the same device, but this is a limitation of CUDAGraph,
// not CUDA itself. We can straightforwardly modify CUDAGraph to support multi-device
// captures if needed.
int capture_dev_;
// RNG state trackers
at::Tensor seed_extragraph_;
at::Tensor offset_extragraph_;
uint64_t wholegraph_increment_;
};
} // namespace cuda
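Taken together, the CUDAGraph changes above replace the single implicit default-generator bookkeeping (capture_gen_, wholegraph_increment_, the extragraph seed/offset tensors) with a map of explicitly registered generator states. A minimal capture/replay sketch against the API declared in this header; the include paths and getDefaultCUDAGenerator are assumptions from the surrounding ATen code, and the captured work is elided:
#include <ATen/cuda/CUDAGraph.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>

void capture_and_replay_sketch() {
  at::cuda::CUDAGraph graph;
  const at::Generator& gen = at::cuda::detail::getDefaultCUDAGenerator();
  graph.register_generator_state(gen);   // explicit registration, per the note referenced above
  // NB: real callers must be on a non-default (side) stream; capture_begin() rejects the default stream.
  graph.capture_begin();                 // default mempool {0, 0}, cudaStreamCaptureModeGlobal
  // ... enqueue the CUDA work (including RNG consumers) to be captured ...
  graph.capture_end();                   // capture_epilogue() records per-generator increments
  graph.replay();                        // replay_prologue() advances each registered state
}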


@ -339,7 +339,7 @@ c10::SmallVector<at::Tensor> CompileAndLaunchKernel(
config.add_owned_output(outs[i]);
}
for (const auto& t: tensors) {
config.add_input(t);
config.add_const_input(t);
}
TensorIterator iter = config.build();
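Several hunks in this compare make the same TensorIterator change: operands that are only read are registered with add_const_input rather than add_input. A minimal sketch of the resulting pattern — out, a, and b are hypothetical tensors, not names from the diff:
auto iter = at::TensorIteratorConfig()
    .add_output(out)        // written to
    .add_const_input(a)     // read-only operands, borrowed as const
    .add_const_input(b)
    .build();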


@ -73,7 +73,7 @@ static void autogradBasedTransformProcess(
return materializeGradWrappers(tensor, current_level);
};
auto num_args = op.schema().arguments().size();
foreachTensorInplace(*stack, stack->size() - num_args, stack->size(), maybeTransformGradWrappers);
foreachTensorInplace(*stack, static_cast<int64_t>(stack->size() - num_args), static_cast<int64_t>(stack->size()), maybeTransformGradWrappers);
setup_dispatch_key_tls(transform_type, {});
op.callBoxed(stack);
@ -133,7 +133,7 @@ static void autogradBasedTransformSendToNext(
auto args_size = op.schema().arguments().size();
const auto ret_size = op.schema().returns().size();
// Step 1
auto front = stack->size() - args_size;
auto front = static_cast<int64_t>(stack->size()) - args_size;
for (const auto arg_idx : c10::irange(0, args_size)) {
stack->push_back((*stack)[front + arg_idx]);
}
@ -151,7 +151,7 @@ static void autogradBasedTransformSendToNext(
// if the input is immutable, we find if it aliases anything, noting that
// args are in reverse order on stack, so the last arg is at the top of the stack
const auto relative_pos = idx - (stack->size() - args_size);
const auto aliased_out = findAliasedOutput(op.schema(), relative_pos);
const auto aliased_out = findAliasedOutput(op.schema(), static_cast<int64_t>(relative_pos));
if (aliased_out.has_value()) {
outputs_aliasing_immutable.flip(*aliased_out); // each output aliases at most one input, so we can only hit this once
}
@ -160,7 +160,7 @@ static void autogradBasedTransformSendToNext(
}
// Step 2
foreachTensorInplace(*stack, stack->size() - args_size, stack->size(), unwrap);
foreachTensorInplace(*stack, static_cast<int64_t>(stack->size() - args_size), static_cast<int64_t>(stack->size()), unwrap);
// See NOTE [grad and vjp interaction with no_grad]
optional<c10::AutoGradMode> grad_guard;
@ -183,7 +183,7 @@ static void autogradBasedTransformSendToNext(
op.callBoxed(stack);
// Step 4
foreachTensorInplaceWithFlag(*stack, stack->size() - ret_size, stack->size(), outputs_aliasing_immutable, wrap);
foreachTensorInplaceWithFlag(*stack, static_cast<int64_t>(stack->size() - ret_size), static_cast<int64_t>(stack->size()), outputs_aliasing_immutable, wrap);
// Step 5
auto args_front = stack->size() - args_size - ret_size;
@ -200,7 +200,7 @@ static void autogradBasedTransformSendToNext(
}
// Step 6
stack->erase(stack->end() - (args_size + ret_size), stack->end() - ret_size);
stack->erase(stack->end() - std::ptrdiff_t(args_size + ret_size), stack->end() - std::ptrdiff_t(ret_size));
}
void GradInterpreterPtr::processImpl(


@ -29,7 +29,7 @@ convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tens
// If we have a batched bias or weight, we need to perform the computation separately.
optional<Tensor> unbatched_bias;
bool separate_bias;
bool separate_bias = false;
if ((rhs_bdim && bias && bias->defined()) || bias_bdim) {
TORCH_INTERNAL_ASSERT(bias.has_value());
TORCH_INTERNAL_ASSERT(bias->defined());
@ -245,7 +245,7 @@ convolution_backward_input_batch_rule(
const Tensor& input, optional<int64_t> input_bdim,
const Tensor& weight, optional<int64_t> weight_bdim,
c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed,
c10::SymIntArrayRef output_padding, c10::SymInt groups) {
c10::SymIntArrayRef output_padding, const c10::SymInt& groups) {
const std::array<bool, 3> mask = {true, false, false};
if (grad_output_bdim && weight_bdim) {
// regular: BNO, BOI -> N(BO), (BO)I -> N(BI)
@ -326,7 +326,7 @@ convolution_backward_weight_batch_rule(
const Tensor& input, optional<int64_t> input_bdim,
const Tensor& weight, optional<int64_t> weight_bdim,
c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed,
c10::SymIntArrayRef output_padding, c10::SymInt groups) {
c10::SymIntArrayRef output_padding, const c10::SymInt& groups) {
const std::array<bool, 3> mask = {false, true, false};
if (grad_output_bdim && input_bdim) {
// BNO, BNI -> N(BO), N(BI) -> (BO)I (regular) (BI)O (transposed)


@ -226,6 +226,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
m.impl("reshape", native::reshape_symint);
OP_DECOMPOSE(resolve_conj);
OP_DECOMPOSE(resolve_neg);
OP_DECOMPOSE(rms_norm);
OP_DECOMPOSE(row_stack);
OP_DECOMPOSE(rrelu);
OP_DECOMPOSE(rrelu_);


@ -118,11 +118,9 @@ Tensor reshape_dim_outof(int64_t src, int64_t size1, const Tensor& x) {
// NOTE: 0 % 0 leads to FPE
TORCH_INTERNAL_ASSERT(shape[src] % size1 == 0);
}
int64_t size2;
// split any size out of `0`-sized dim
if (shape[src] == 0) {
size2 = 0;
} else {
int64_t size2 = 0;
if (shape[src] != 0) {
size2 = shape[src] / size1;
}
shape[src] = size1;
@ -130,7 +128,7 @@ Tensor reshape_dim_outof(int64_t src, int64_t size1, const Tensor& x) {
return at::reshape(x, shape);
}
Tensor reshape_dim_outof_symint(int64_t src, c10::SymInt size1, const Tensor& x) {
Tensor reshape_dim_outof_symint(int64_t src, const c10::SymInt& size1, const Tensor& x) {
src = maybe_wrap_dim(src, x.dim());
c10::SymDimVector shape(x.sym_sizes().begin(), x.sym_sizes().end());
if (shape[src] != 0) {
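For reference, reshape_dim_outof splits dimension src of size size1 * size2 into two adjacent dimensions [size1, size2], with the zero-sized case handled explicitly above. A hypothetical illustration, not code from the diff (assumes the ATen and functorch headers):
auto x = at::randn({2, 12});
auto y = at::functorch::reshape_dim_outof(/*src=*/1, /*size1=*/3, x);
// y.sizes() == [2, 3, 4]; a zero-sized source dim would instead split into [..., 3, 0, ...]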


@ -28,7 +28,7 @@ namespace at::functorch {
TORCH_API Tensor reshape_dim_into(int64_t src, int64_t dst, const Tensor& x);
TORCH_API Tensor reshape_dim_outof(int64_t src, int64_t size1, const Tensor& x);
TORCH_API Tensor reshape_dim_outof_symint(int64_t src, c10::SymInt size1, const Tensor& x);
TORCH_API Tensor reshape_dim_outof_symint(int64_t src, const c10::SymInt& size1, const Tensor& x);
Tensor moveBatchDimToFront(const Tensor& tensor, optional<int64_t> maybe_batch_dim);
int64_t rankWithoutBatchDim(const Tensor& tensor, optional<int64_t> maybe_batch_dim);
@ -146,7 +146,7 @@ void boxed_tensor_inputs_batch_rule(const c10::OperatorHandle& op, torch::jit::S
if (ivalue.isTensor()) {
auto [tensor_value, tensor_bdim] = unwrapTensorAtLevel(ivalue.toTensor(), cur_level);
tensor_inputs.emplace_back(tensor_value, tensor_bdim);
tensor_pos.push_back(idx);
tensor_pos.push_back(static_cast<int64_t>(idx));
}
}
Func(tensor_inputs);
@ -212,7 +212,7 @@ inline void find_and_unpack_tensors(
int64_t* batch_size) {
int64_t computed_batch_size = -1;
int64_t args_begin = stack->size() - num_args;
int64_t args_begin = static_cast<int64_t>(stack->size()) - num_args;
for (const auto idx : c10::irange(0, num_args)) {
const auto& ivalue = (*stack)[args_begin + idx];
@ -241,7 +241,7 @@ inline void boxed_existing_bdim_all_batch_rule(
const c10::OperatorHandle& op, torch::jit::Stack* stack) {
const auto& schema = op.schema();
const auto num_returns = schema.returns().size();
const auto num_arguments = schema.arguments().size();
const auto num_arguments = static_cast<int64_t>(schema.arguments().size());
c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
auto maybe_layer = maybeCurrentDynamicLayer();
@ -254,10 +254,10 @@ inline void boxed_existing_bdim_all_batch_rule(
return;
}
int64_t args_begin = stack->size() - num_arguments;
int64_t args_begin = static_cast<int64_t>(stack->size()) - num_arguments;
SmallVector<UnpackedBatchedTensor, 5> tensor_inputs;
SmallVector<int64_t, 5> tensor_pos;
int64_t batch_size;
int64_t batch_size = 0;
find_and_unpack_tensors(
stack, num_arguments, cur_level,
@ -310,13 +310,13 @@ inline void boxed_all_tensors_have_optional_bdim(
return;
}
int64_t args_begin = stack->size() - num_arguments;
int64_t args_begin = static_cast<int64_t>(stack->size() - num_arguments);
SmallVector<UnpackedBatchedTensor, 5> tensor_inputs;
SmallVector<int64_t, 5> tensor_pos;
int64_t batch_size;
int64_t batch_size = 0;
find_and_unpack_tensors(
stack, num_arguments, cur_level,
stack, static_cast<int64_t>(num_arguments), cur_level,
&tensor_inputs, &tensor_pos, &batch_size);
optional<bool> is_no_batch_dim_case;


@ -370,7 +370,7 @@ fourOutputs solve_ex_batch_rule(
TORCH_CHECK(A_logical_rank >= 2,
"linalg.solve: The input tensor A must have at least 2 dimensions.");
int b_logical_rank = max_logical_rank;
auto b_logical_rank = max_logical_rank;
if (A_logical_rank > B_logical_rank) { // vector case: B was a vector or batched vector
// not accurate but matches linalg error message
TORCH_CHECK(B_logical_rank >= 1, "linalg.solve: The input tensor B must have at least 2 dimensions.");
@ -574,6 +574,7 @@ pinv_batch_rule(
}
// These need to be outside. String constant must be declared outside of a macro to be used as template param
// NOLINTBEGIN(*array*)
LINALG_CHECK_MATRIX_UNARY_ONE_OUT(cholesky, cholesky);
LINALG_CHECK_MATRIX_UNARY_ONE_OUT(cholesky_inverse, cholesky_inverse);
LINALG_CHECK_MATRIX_UNARY_TWO_OUT(linalg_cholesky_ex, linalg.cholesky);
@ -590,6 +591,7 @@ LINALG_CHECK_MATRIX_UNARY_THREE_OUT(_linalg_det, linalg.det);
LINALG_CHECK_MATRIX_UNARY_TWO_OUT(_linalg_eigh, linalg.eigh);
LINALG_CHECK_MATRIX_UNARY_FOUR_OUT(_linalg_slogdet, linalg.slogdet);
LINALG_CHECK_MATRIX_UNARY_THREE_OUT(_linalg_svd, linalg.svd);
// NOLINTEND(*array*)
TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
VMAP_SUPPORT(bmm, bmm_batch_rule);


@ -474,7 +474,7 @@ C10_ALWAYS_INLINE void _check_layer_norm_inputs(
const Tensor& weight, optional<int64_t> weight_bdim,
const Tensor& bias, optional<int64_t> bias_bdim) {
const int normalized_ndim = normalized_shape.size();
const auto normalized_ndim = normalized_shape.size();
TORCH_CHECK(
normalized_ndim >= 1,
"Expected normalized_shape to be at least 1-dimensional, i.e., ",
@ -616,7 +616,7 @@ static std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_backward_p
if (num_front_dims_to_reduce == 0) {
grad_bias = grad_out;
} else {
grad_bias = grad_out.sum(range(0, num_front_dims_to_reduce));
grad_bias = grad_out.sum(range(0, static_cast<int64_t>(num_front_dims_to_reduce)));
}
}
if (output_mask[1] && weight_value.has_value()) {
@ -628,7 +628,7 @@ static std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_backward_p
if (num_front_dims_to_reduce == 0) {
grad_weight = expanded_grad_weight;
} else {
grad_weight = expanded_grad_weight.sum(range(0, num_front_dims_to_reduce));
grad_weight = expanded_grad_weight.sum(range(0, static_cast<int64_t>(num_front_dims_to_reduce)));
}
}
if (output_mask[0]) {


@ -199,8 +199,8 @@ static std::tuple<Tensor,Tensor> native_dropout_batching_rule(const Tensor& tens
}
auto [output, mask] = at::native_dropout(tensor_value, p, train);
return std::make_tuple(
makeBatched(std::move(output), 0, cur_level),
makeBatched(std::move(mask), 0, cur_level));
makeBatched(output, 0, cur_level),
makeBatched(mask, 0, cur_level));
}
// repeated code from the CPU kernel since the CUDA one doesn't call bernoulli_ explicitly
@ -264,7 +264,7 @@ struct RandomBatchRuleHelper<F, Func, typelist<T1, T...>> {
template <typename F, F Func, typename... T>
Tensor rand_int_wrapper(SymIntArrayRef shape, c10::SymInt high, T... extra_args) {
return Func(high, std::move(shape), std::forward<T>(extra_args)...);
return Func(high, shape, std::forward<T>(extra_args)...);
}
template <typename A, A a, typename C>


@ -75,7 +75,7 @@ static Tensor any_decomp(const Tensor& self) {
return at::any(self.flatten(), 0, false);
}
enum ReductionCase { DimArray, Dim };
enum class ReductionCase:uint8_t { DimArray, Dim };
// Macros and templates have a difficult time dealing with enums,
// so we didn't turn this into an enum.
@ -129,7 +129,7 @@ void boxed_reduction_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack
auto logical_dim = rankWithoutBatchDim(self, self_bdim);
std::vector<int64_t> dims;
ReductionCase reduction_case;
ReductionCase reduction_case{};
if (arguments[dim_arg_pos].isIntList()) {
reduction_case = ReductionCase::DimArray;
dims = arguments[dim_arg_pos].toIntList().vec();


@ -11,7 +11,6 @@
#include <ATen/native/TensorAdvancedIndexing.h>
#include <ATen/native/IndexKernel.h>
#include <ATen/native/IndexingUtils.h>
#include <iostream>
#include <torch/library.h>
@ -810,7 +809,7 @@ Tensor get_expanded_index(const Tensor& index, IntArrayRef self_size, int64_t di
if (index.dim() == 0) {
return index.expand(self_size);
}
dim = maybe_wrap_dim(dim, self_size.size());
dim = maybe_wrap_dim(dim, static_cast<int64_t>(self_size.size()));
// setup new_index_shape as [BS, 1, ..., idx_size, ..., 1]
// to reshape index_


@ -5,7 +5,6 @@
// LICENSE file in the root directory of this source tree.
#include <ATen/functorch/BatchRulesHelper.h>
#include <iostream>
#include <utility>
#include <ATen/Operators.h>
@ -202,7 +201,7 @@ std::tuple<Tensor, optional<int64_t>> squeeze_batch_rule(const Tensor& self, opt
int64_t new_batch_idx = 0;
int64_t original_idx = 0;
for (auto it : shape) {
for (const auto& it : shape) {
// Keep only dimensions != 1 and the batch dimension (irrespective of size).
if (it != 1 || original_idx == bdim) {
squeezed_sizes.push_back(it);
@ -452,7 +451,7 @@ std::tuple<Tensor, optional<int64_t>> expand_batch_rule(
auto self_ = moveBatchDimToFront(self, self_bdim);
auto self_sizes = self_.sym_sizes();
auto batch_size = self_sizes[0];
const auto& batch_size = self_sizes[0];
c10::SmallVector<c10::SymInt> size_(size.size() + 1);
size_[0] = batch_size;


@ -159,7 +159,7 @@ static void batchedTensorInplaceForLoopFallback(const c10::OperatorHandle& op, t
"please file a bug report instead.");
}
batched_tensor_inputs.push_back(tensor);
batched_tensor_inputs_position.push_back(idx);
batched_tensor_inputs_position.push_back(static_cast<int64_t>(idx));
}
TORCH_INTERNAL_ASSERT(!batched_tensor_inputs.empty());
@ -304,7 +304,7 @@ void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Sta
continue;
}
batched_tensor_inputs.push_back(tensor);
batched_tensor_inputs_position.push_back(idx);
batched_tensor_inputs_position.push_back(static_cast<int64_t>(idx));
}
TORCH_INTERNAL_ASSERT(!batched_tensor_inputs.empty());
@ -445,18 +445,18 @@ void batchedNestedTensorForLoopFallback(const c10::OperatorHandle& op, torch::ji
continue;
}
batched_tensor_inputs.push_back(tensor);
batched_tensor_inputs_position.push_back(idx);
batched_tensor_inputs_position.push_back(static_cast<int64_t>(idx));
}
TORCH_INTERNAL_ASSERT(!batched_tensor_inputs.empty());
std::vector<std::vector<Tensor>> unbound;
for (auto iter = batched_tensor_inputs.begin(); iter != batched_tensor_inputs.end(); ++iter) {
auto *batched_impl = maybeGetBatchedImpl(*iter);
for (auto const &batched_tensor_input: batched_tensor_inputs) {
auto *batched_impl = maybeGetBatchedImpl(batched_tensor_input);
TORCH_INTERNAL_ASSERT(batched_impl->value().is_nested() || batched_impl->bdim() == 0,
"Fallback not supported for mixed nested / non-nested arguments without bdim=0");
c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::BatchedNestedTensor);
auto this_unbound = batched_impl->value().unbind();
if (unbound.size() > 0) {
if (!unbound.empty()) {
TORCH_INTERNAL_ASSERT(unbound.front().size() == this_unbound.size(),
"Fallback not supported for differently-sized nested arguments");
}


@ -70,7 +70,7 @@ void BatchedTensorImpl::refreshTensorMetadata() {
int64_t BatchedTensorImpl::actualDim(int64_t dim, bool wrap_dim) const {
if (wrap_dim) {
const auto ndim = sizes_and_strides_.size();
dim = maybe_wrap_dim(dim, ndim);
dim = maybe_wrap_dim(dim, static_cast<int64_t>(ndim));
}
if (bdim_ <= dim) {
return dim + 1;
@ -160,6 +160,7 @@ c10::intrusive_ptr<TensorImpl> BatchedTensorImpl::shallow_copy_and_detach(
}
c10::intrusive_ptr<TensorImpl> BatchedTensorImpl::shallow_copy_and_detach(
// NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved)
c10::VariableVersion&& version_counter,
bool allow_tensor_metadata_change) const {
TORCH_CHECK(false, "accessing `data` under vmap transform is not allowed");


@ -7,7 +7,6 @@
#pragma once
#include <bitset>
#include <utility>
#include <ATen/ArrayRef.h>
#include <ATen/SmallVector.h>
@ -119,15 +118,15 @@ inline bool isBatchedTensor(const Tensor& tensor) {
// It is unsafe to call this on a Tensor that is not backed by a
// BatchedTensorImpl. Please use `maybeGetBatchedImpl` whenever possible.
inline BatchedTensorImpl* unsafeGetBatchedImpl(Tensor tensor) {
inline BatchedTensorImpl* unsafeGetBatchedImpl(const Tensor& tensor) {
return static_cast<BatchedTensorImpl*>(tensor.unsafeGetTensorImpl());
}
inline BatchedTensorImpl* maybeGetBatchedImpl(Tensor tensor) {
inline BatchedTensorImpl* maybeGetBatchedImpl(const Tensor& tensor) {
if (!isBatchedTensor(tensor)) {
return nullptr;
}
return unsafeGetBatchedImpl(std::move(tensor));
return unsafeGetBatchedImpl(tensor);
}
// Returns a bitset. If bit i is set, then that means dim i is a batchdim.
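With maybeGetBatchedImpl now taking const Tensor& (probing no longer copies or moves the tensor), the usual call pattern is a null-checked pointer, as in the fallback loops elsewhere in this compare. A small sketch; tensor is a hypothetical at::Tensor:
if (auto* batched = at::functorch::maybeGetBatchedImpl(tensor)) {
  const auto& value = batched->value();   // underlying, unbatched tensor
  const auto bdim = batched->bdim();      // which dim carries the vmap batch
  // ... apply a batching rule using value and bdim ...
}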


@ -234,7 +234,7 @@ int64_t pushDynamicLayer(DynamicLayer&& dynamic_layer) {
auto& dynamicLayerStack = dynamicLayerStackAccessor();
int64_t layerId = 1 + dynamicLayerStack.size();
TORCH_INTERNAL_ASSERT(layerId == dynamic_layer.layerId());
dynamicLayerStack.emplace_back(dynamic_layer);
dynamicLayerStack.emplace_back(std::move(dynamic_layer));
if (layerId == 1) {
setDynamicLayerFrontBackKeysIncluded(true);
@ -257,7 +257,7 @@ int64_t initAndPushDynamicLayer(
optional<bool> functionalize_add_back_views) {
const auto& dynamicLayerStack = dynamicLayerStackAccessor();
const auto layerId = 1 + dynamicLayerStack.size();
DynamicLayer new_layer(transform_type, layerId, batch_size, randomness, prev_grad_mode, prev_fwd_grad_mode, functionalize_add_back_views);
DynamicLayer new_layer(transform_type, layerId, std::move(batch_size), randomness, prev_grad_mode, prev_fwd_grad_mode, functionalize_add_back_views);
// NB: this function should be called while holding the GIL to avoid races
new_layer.interpreter().set_is_alive(true);
pushDynamicLayer(std::move(new_layer));
@ -306,7 +306,7 @@ void foreachTensorInplace(std::vector<IValue>& args, int64_t begin, int64_t end,
}
void foreachTensorInplaceWithFlag(std::vector<IValue>& args, int64_t begin, int64_t end,
const std::bitset<64> use_flag_relative, std::function<Tensor(const Tensor&, bool)> func){
const std::bitset<64> use_flag_relative, const std::function<Tensor(const Tensor&, bool)>& func){
TORCH_INTERNAL_ASSERT(begin >= 0);
TORCH_INTERNAL_ASSERT(end >= 0);
TORCH_INTERNAL_ASSERT(begin <= end);


@ -6,8 +6,6 @@
#include <ATen/functorch/ADInterpreters.h>
#include <ATen/functorch/DynamicLayer.h>
#include <utility>
namespace at::functorch {
static DispatchKeySet get_all_dynlayer_keyset() {
@ -92,12 +90,12 @@ std::ostream& operator<<(std::ostream& os, const TransformType& t) {
void sanityCheckStack(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
auto num_args = op.schema().arguments().size();
foreachTensorInplace(*stack, stack->size() - num_args, stack->size(),
foreachTensorInplace(*stack, static_cast<int64_t>(stack->size() - num_args), static_cast<int64_t>(stack->size()),
[](const Tensor& tensor) {
auto result = unwrapIfDead(tensor);
auto* wrapper = maybeGetTensorWrapper(result);
TORCH_INTERNAL_ASSERT(wrapper == nullptr);
auto* batched = maybeGetBatchedImpl(std::move(result));
auto* batched = maybeGetBatchedImpl(result);
TORCH_INTERNAL_ASSERT(batched == nullptr);
return tensor;
});


@ -5,6 +5,7 @@
#include <c10/core/impl/LocalDispatchKeySet.h>
#include <c10/util/Optional.h>
#include <bitset>
#include <utility>
#include <variant>
namespace at::functorch {
@ -144,7 +145,7 @@ struct Interpreter {
void saveLocalDispatchKeySet(c10::impl::LocalDispatchKeySet keyset) {
TORCH_INTERNAL_ASSERT(!savedLocalDispatchKeySet_.has_value());
savedLocalDispatchKeySet_ = std::move(keyset);
savedLocalDispatchKeySet_ = keyset;
}
void clearSavedLocalDispatchKeySet() {
TORCH_INTERNAL_ASSERT(savedLocalDispatchKeySet_.has_value());
@ -173,11 +174,11 @@ struct Interpreter {
private:
explicit Interpreter(TransformType type, int64_t level, InterpreterMeta meta):
type_(type), level_(level), is_alive_(std::make_shared<bool>(false)), meta_(meta) {}
type_(type), level_(level), is_alive_(std::make_shared<bool>(false)), meta_(std::move(meta)) {}
// fields
TransformType type_;
int64_t level_;
TransformType type_{};
int64_t level_{};
optional<c10::impl::LocalDispatchKeySet> savedLocalDispatchKeySet_;
std::shared_ptr<bool> is_alive_;
InterpreterMeta meta_;
@ -195,7 +196,7 @@ void foreachTensorInplace(std::vector<IValue>& args, int64_t begin, int64_t end,
// args[i] = func(args[i], i - begin, true)
// args[i] = func(args[i], i - begin)
void foreachTensorInplaceWithFlag(std::vector<IValue>& args, int64_t begin, int64_t end,
const std::bitset<64> use_flag_relative, std::function<Tensor(const Tensor&, bool)> func);
const std::bitset<64> use_flag_relative, const std::function<Tensor(const Tensor&, bool)>& func);
std::vector<int64_t> findUnwrappedInputs(std::vector<IValue>& args, int64_t begin, int64_t end);


@ -286,7 +286,7 @@ std::vector<Tensor> unbind_batching_rule(const Tensor& self, int64_t dim) {
// can be indexed (or nullopt if such a location doesn't exist, e.g., tensors
// with zero-size dims).
static optional<c10::SymInt> maximum_indexable_location(
c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, c10::SymInt storage_offset) {
c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, const c10::SymInt& storage_offset) {
auto result = native::storage_size_for(sizes, strides);
if (result == 0) {
return nullopt;
@ -303,7 +303,7 @@ static void checkBasicAsStridedValidForSlice(
int64_t num_batch_dims,
c10::SymIntArrayRef sizes,
c10::SymIntArrayRef strides,
optional<c10::SymInt> maybe_storage_offset) {
const optional<c10::SymInt>& maybe_storage_offset) {
auto slice_sizes = physical_tensor.sym_sizes().slice(num_batch_dims);
auto slice_strides = physical_tensor.sym_strides().slice(num_batch_dims);
auto base_offset = physical_tensor.sym_storage_offset();
@ -693,17 +693,17 @@ Tensor new_empty_strided_batching_rule(
}
Tensor nested_cat_batching_rule(const ITensorListRef& tensors, int64_t dim) {
TORCH_CHECK(tensors.size() > 0, "cat() not supported on empty tensor list");
TORCH_CHECK(!tensors.empty(), "cat() not supported on empty tensor list");
std::vector<std::vector<Tensor>> unbound;
for (auto tensor_iter = tensors.begin(); tensor_iter != tensors.end(); ++tensor_iter) {
auto* maybe_batched_impl = maybeGetBatchedImpl(*tensor_iter);
for (const auto & tensor : tensors) {
auto* maybe_batched_impl = maybeGetBatchedImpl(tensor);
TORCH_CHECK(maybe_batched_impl, "Tried to run batching rule for cat() on a non-batched tensor");
auto nt = maybe_batched_impl->value();
TORCH_CHECK(nt.is_nested(), "Tried to run batching rule for cat() on a non-nested tensor");
c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::BatchedNestedTensor);
auto this_unbound = nt.unbind();
if (unbound.size() > 0) {
if (!unbound.empty()) {
TORCH_INTERNAL_ASSERT(unbound.front().size() == this_unbound.size(),
"cat() not supported for differently-sized nested arguments");
}


@ -135,7 +135,7 @@ MultiBatchVmapTransform::logicalToPhysical(ITensorListRef logical_tensors) {
TORCH_INTERNAL_ASSERT(bdim_size != -1);
std::bitset<kVmapNumLevels> levels;
levels[cur_level] = 1;
levels[cur_level] = true;
VmapPhysicalViewVec result;
for (const auto& logical_tensor : logical_tensors) {
@ -184,7 +184,7 @@ VmapPhysicalViewVec BroadcastingVmapTransform::logicalToPhysical(TensorList logi
TORCH_INTERNAL_ASSERT(bdim_size != -1);
std::bitset<kVmapNumLevels> levels;
levels[cur_level] = 1;
levels[cur_level] = true;
// figure out the example ndim
int64_t max_example_dim = -1;


@ -120,7 +120,7 @@ struct VmapPhysicalToLogicalMap;
// levels: 012345
struct TORCH_API VmapPhysicalView {
VmapPhysicalView(Tensor&& tensor, std::bitset<kVmapNumLevels> levels)
: levels_(levels), tensor_(tensor) {
: levels_(levels), tensor_(std::move(tensor)) {
// TORCH_INTERNAL_ASSERT(!isBatchedTensor(tensor));
}


@ -167,7 +167,7 @@ namespace dropout_hack {
namespace {
template<bool inplace>
using Ctype = typename std::conditional<inplace, Tensor&, Tensor>::type;
using Ctype = std::conditional_t<inplace, Tensor&, Tensor>;
static Tensor make_feature_noise(const Tensor& input) {
auto input_sizes = input.sizes();


@ -50,7 +50,7 @@ void TensorWrapper::refreshMetadata() {
void dumpTensorCout(const Tensor& tensor) {
dumpTensor(std::cout, tensor);
std::cout << std::endl;
std::cout << '\n';
}
static c10::intrusive_ptr<TensorWrapper> makeTensorWrapperPtr(const Tensor& tensor, int64_t level, const std::shared_ptr<bool>& life_handle) {


@ -649,8 +649,8 @@ void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& other, bo
char side = left ? 'L' : 'R';
char trans = transpose ? (input.is_complex() ? 'C' : 'T') : 'N';
auto input_data = input.data_ptr<scalar_t>();
auto tau_data = tau.data_ptr<scalar_t>();
auto input_data = input.const_data_ptr<scalar_t>();
auto tau_data = tau.const_data_ptr<scalar_t>();
auto other_data = other.data_ptr<scalar_t>();
auto input_matrix_stride = matrixStride(input);
@ -670,21 +670,21 @@ void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& other, bo
// Query for the optimal size of the workspace tensor
int lwork = -1;
scalar_t wkopt;
lapackOrmqr<scalar_t>(side, trans, m, n, k, input_data, lda, tau_data, other_data, ldc, &wkopt, lwork, &info);
lapackOrmqr<scalar_t>(side, trans, m, n, k, const_cast<scalar_t*>(input_data), lda, const_cast<scalar_t*>(tau_data), other_data, ldc, &wkopt, lwork, &info);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info == 0);
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
Tensor work = at::empty({lwork}, input.options());
for (const auto i : c10::irange(batch_size)) {
scalar_t* input_working_ptr = &input_data[i * input_matrix_stride];
const scalar_t* input_working_ptr = &input_data[i * input_matrix_stride];
scalar_t* other_working_ptr = &other_data[i * other_matrix_stride];
scalar_t* tau_working_ptr = &tau_data[i * tau_stride];
const scalar_t* tau_working_ptr = &tau_data[i * tau_stride];
// now compute the actual result
lapackOrmqr<scalar_t>(
side, trans, m, n, k,
input_working_ptr, lda,
tau_working_ptr,
const_cast<scalar_t*>(input_working_ptr), lda,
const_cast<scalar_t*>(tau_working_ptr),
other_working_ptr, ldc,
work.data_ptr<scalar_t>(), lwork, &info);


@ -2,6 +2,7 @@
#include <ATen/Context.h>
#include <ATen/Config.h>
#include <ATen/OpMathType.h>
#include <ATen/Parallel.h>
#include <c10/core/ScalarType.h>
#include <c10/util/Exception.h>
#include <c10/util/complex.h>
@ -210,34 +211,39 @@ static inline float16_t reduce(float16x4_t x) {
auto sum = vpadd_f16(x, x);
return vget_lane_f16(vpadd_f16(sum, sum), 0);
}
static inline float16_t reduce(float16x8_t x) {
return reduce(vadd_f16(vget_low_f16(x), vget_high_f16(x)));
}
static void fp16_gemv_trans_fp16_arith(const int m, const int n, const float16_t* a, const int lda, const float16_t *x, float16_t* y, int incy) {
for (auto i = 0 ; i < n; i += 4) {
float16x4_t sum0Vec = vdup_n_f16(0);
float16x4_t sum1Vec = vdup_n_f16(0);
float16x4_t sum2Vec = vdup_n_f16(0);
float16x4_t sum3Vec = vdup_n_f16(0);
const auto row0 = a + lda * (i + 0);
const auto row1 = a + lda * (i + 1);
const auto row2 = a + lda * (i + 2);
const auto row3 = a + lda * (i + 3);
for (auto j = 0; j < m; j += 4) {
float16x4_t a0Vec = vld1_f16(row0 + j);
float16x4_t a1Vec = vld1_f16(row1 + j);
float16x4_t a2Vec = vld1_f16(row2 + j);
float16x4_t a3Vec = vld1_f16(row3 + j);
float16x4_t xVec = vld1_f16(x + j);
sum0Vec = vadd_f16(sum0Vec, vmul_f16(a0Vec, xVec));
sum1Vec = vadd_f16(sum1Vec, vmul_f16(a1Vec, xVec));
sum2Vec = vadd_f16(sum2Vec, vmul_f16(a2Vec, xVec));
sum3Vec = vadd_f16(sum3Vec, vmul_f16(a3Vec, xVec));
parallel_for(0, n / 4, 1, [&](int begin, int end) {
for (auto i = begin * 4 ; i < end * 4; i += 4) {
float16x8_t sum0Vec = vdupq_n_f16(0);
float16x8_t sum1Vec = vdupq_n_f16(0);
float16x8_t sum2Vec = vdupq_n_f16(0);
float16x8_t sum3Vec = vdupq_n_f16(0);
const auto row0 = a + lda * (i + 0);
const auto row1 = a + lda * (i + 1);
const auto row2 = a + lda * (i + 2);
const auto row3 = a + lda * (i + 3);
for (auto j = 0; j < m; j += 8) {
float16x8_t xVec = vld1q_f16(x + j);
float16x8_t a0Vec = vld1q_f16(row0 + j);
sum0Vec = vaddq_f16(sum0Vec, vmulq_f16(a0Vec, xVec));
float16x8_t a1Vec = vld1q_f16(row1 + j);
sum1Vec = vaddq_f16(sum1Vec, vmulq_f16(a1Vec, xVec));
float16x8_t a2Vec = vld1q_f16(row2 + j);
sum2Vec = vaddq_f16(sum2Vec, vmulq_f16(a2Vec, xVec));
float16x8_t a3Vec = vld1q_f16(row3 + j);
sum3Vec = vaddq_f16(sum3Vec, vmulq_f16(a3Vec, xVec));
}
y[(i + 0) * incy] = reduce(sum0Vec);
y[(i + 1) * incy] = reduce(sum1Vec);
y[(i + 2) * incy] = reduce(sum2Vec);
y[(i + 3) * incy] = reduce(sum3Vec);
}
y[(i + 0) * incy] = reduce(sum0Vec);
y[(i + 1) * incy] = reduce(sum1Vec);
y[(i + 2) * incy] = reduce(sum2Vec);
y[(i + 3) * incy] = reduce(sum3Vec);
}
});
}
#endif
@ -247,31 +253,33 @@ static inline float reduce(float32x4_t x) {
}
static void fp16_gemv_trans_fp32_arith(const int m, const int n, const float16_t* a, const int lda, const float16_t *x, float16_t* y, int incy) {
for (auto i = 0 ; i < n; i += 4) {
float32x4_t sum0Vec = vdupq_n_f32(0);
float32x4_t sum1Vec = vdupq_n_f32(0);
float32x4_t sum2Vec = vdupq_n_f32(0);
float32x4_t sum3Vec = vdupq_n_f32(0);
const auto row0 = a + lda * (i + 0);
const auto row1 = a + lda * (i + 1);
const auto row2 = a + lda * (i + 2);
const auto row3 = a + lda * (i + 3);
for (auto j = 0; j < m; j += 4) {
float32x4_t a0Vec = vcvt_f32_f16(vld1_f16(row0 + j));
float32x4_t a1Vec = vcvt_f32_f16(vld1_f16(row1 + j));
float32x4_t a2Vec = vcvt_f32_f16(vld1_f16(row2 + j));
float32x4_t a3Vec = vcvt_f32_f16(vld1_f16(row3 + j));
float32x4_t xVec = vcvt_f32_f16(vld1_f16(x + j));
sum0Vec = vaddq_f32(sum0Vec, vmulq_f32(a0Vec, xVec));
sum1Vec = vaddq_f32(sum1Vec, vmulq_f32(a1Vec, xVec));
sum2Vec = vaddq_f32(sum2Vec, vmulq_f32(a2Vec, xVec));
sum3Vec = vaddq_f32(sum3Vec, vmulq_f32(a3Vec, xVec));
parallel_for(0, n / 4, 1, [&](int begin, int end) {
for (auto i = begin * 4 ; i < end * 4; i += 4) {
float32x4_t sum0Vec = vdupq_n_f32(0);
float32x4_t sum1Vec = vdupq_n_f32(0);
float32x4_t sum2Vec = vdupq_n_f32(0);
float32x4_t sum3Vec = vdupq_n_f32(0);
const auto row0 = a + lda * (i + 0);
const auto row1 = a + lda * (i + 1);
const auto row2 = a + lda * (i + 2);
const auto row3 = a + lda * (i + 3);
for (auto j = 0; j < m; j += 4) {
float32x4_t xVec = vcvt_f32_f16(vld1_f16(x + j));
float32x4_t a0Vec = vcvt_f32_f16(vld1_f16(row0 + j));
sum0Vec = vaddq_f32(sum0Vec, vmulq_f32(a0Vec, xVec));
float32x4_t a1Vec = vcvt_f32_f16(vld1_f16(row1 + j));
sum1Vec = vaddq_f32(sum1Vec, vmulq_f32(a1Vec, xVec));
float32x4_t a2Vec = vcvt_f32_f16(vld1_f16(row2 + j));
sum2Vec = vaddq_f32(sum2Vec, vmulq_f32(a2Vec, xVec));
float32x4_t a3Vec = vcvt_f32_f16(vld1_f16(row3 + j));
sum3Vec = vaddq_f32(sum3Vec, vmulq_f32(a3Vec, xVec));
}
y[(i + 0) * incy] = reduce(sum0Vec);
y[(i + 1) * incy] = reduce(sum1Vec);
y[(i + 2) * incy] = reduce(sum2Vec);
y[(i + 3) * incy] = reduce(sum3Vec);
}
y[(i + 0) * incy] = reduce(sum0Vec);
y[(i + 1) * incy] = reduce(sum1Vec);
y[(i + 2) * incy] = reduce(sum2Vec);
y[(i + 3) * incy] = reduce(sum3Vec);
}
});
}
void fp16_gemv_trans(
@ -287,8 +295,8 @@ void fp16_gemv_trans(
const int incy) {
if (incx == 1 && alpha == 1.0 && beta == 0.0 && m % 4 == 0 && n % 4 == 0) {
#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
return at::globalContext().allowFP16ReductionCPU() ? fp16_gemv_trans_fp16_arith(m, n, a, lda, x, y, incy)
: fp16_gemv_trans_fp32_arith(m, n, a, lda, x, y, incy);
return at::globalContext().allowFP16ReductionCPU() && m % 8 == 0 ? fp16_gemv_trans_fp16_arith(m, n, a, lda, x, y, incy)
: fp16_gemv_trans_fp32_arith(m, n, a, lda, x, y, incy);
#else
return fp16_gemv_trans_fp32_arith(m, n, a, lda, x, y, incy);
#endif
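The gemv changes above swap the serial loop over output columns for at::parallel_for over blocks of four columns, widen the fp16 path to float16x8 accumulators (hence the new m % 8 == 0 requirement in the dispatch), and load the x vector before the row vectors. A sketch of the parallel_for contract being relied on, with the arithmetic elided; the signature is assumed from ATen/Parallel.h:
at::parallel_for(0, n / 4, /*grain_size=*/1, [&](int64_t begin, int64_t end) {
  // Each task owns the half-open block range [begin, end); a block is 4 output columns.
  for (auto block = begin; block < end; ++block) {
    const auto i = block * 4;   // first of the 4 columns handled in this iteration
    // ... accumulate four dot products and write y[(i + 0..3) * incy] ...
  }
});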


@ -92,9 +92,9 @@ void searchsorted_cpu_contiguous(Tensor& result, const Tensor& input, const Tens
int64_t idim_in = is_scalar_input ? 1 : input.sizes().back();
int64_t idim_bd = boundaries.sizes().back();
const input_t *data_in = input.data_ptr<input_t>();
const input_t *data_bd = boundaries.data_ptr<input_t>();
const int64_t *data_st = sorter.defined() ? sorter.data_ptr<int64_t>() : nullptr;
const input_t *data_in = input.const_data_ptr<input_t>();
const input_t *data_bd = boundaries.const_data_ptr<input_t>();
const int64_t *data_st = sorter.defined() ? sorter.const_data_ptr<int64_t>() : nullptr;
output_t *data_out = result.data_ptr<output_t>();
bool is_1d_boundaries = boundaries.dim() == 1;
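This hunk is representative of the const-correctness theme running through many files in this compare: buffers that are only read are fetched through const_data_ptr or const accessors, while outputs keep data_ptr / mutable_data_ptr. A minimal sketch of the distinction; t (2-D) and result are hypothetical tensors:
const float* in = t.const_data_ptr<float>();      // read-only pointer into the storage
float* out = result.mutable_data_ptr<float>();    // mutable pointer, used for outputs
auto in_acc = t.accessor<const float, 2>();       // const CPU accessor, as in the conv hunks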


@ -61,7 +61,7 @@ static Tensor compute_columns2d(
kernel_height * kernel_width * n_input_plane : output_height * output_width;
columns = at::empty({batch_size, row, col}, input.options());
AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "slow_conv2d_cpu", [&]{
auto input_a = input.accessor<scalar_t, 4>();
auto input_a = input.accessor<const scalar_t, 4>();
auto columns_a = columns.accessor<scalar_t, 3>();
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
@ -220,9 +220,9 @@ static inline Tensor view_weight_2d(const Tensor& weight_,
template <typename scalar_t>
static void slow_conv2d_update_output_frame(
TensorAccessor<scalar_t, 3> input,
TensorAccessor<const scalar_t, 3> input,
TensorAccessor<scalar_t, 3> output,
TensorAccessor<scalar_t, 2> weight,
TensorAccessor<const scalar_t, 2> weight,
bool has_bias,
TensorAccessor<scalar_t, 2> finput,
int64_t kernel_height,
@ -588,10 +588,10 @@ Tensor& slow_conv2d_forward_out_cpu(
TORCH_CHECK(output.is_contiguous(memory_format), "slow_conv2d output tensor must be contiguous");
AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "slow_conv2d_cpu", [&]{
auto input_a = input.accessor<scalar_t, 4>();
auto input_a = input.accessor<const scalar_t, 4>();
auto output_a = output.accessor<scalar_t, 4>();
auto finput_a = finput.accessor<scalar_t, 3>();
auto weight_2d_a = weight_2d.accessor<scalar_t, 2>();
auto weight_2d_a = weight_2d.accessor<const scalar_t, 2>();
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
for (const auto t : c10::irange(start, end)) {


@ -72,7 +72,7 @@ static Tensor compute_columns3d(
input.options());
AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "compute_columns3d", [&] {
auto input_a = input.accessor<scalar_t, 5>();
auto input_a = input.accessor<const scalar_t, 5>();
auto columns_a = columns.accessor<scalar_t, 3>();
at::parallel_for(0, batch_size, CONV3D_GRAIN_SALT, [&](int64_t start, int64_t end) {
@ -261,11 +261,11 @@ static Tensor view_weight_2d(const Tensor& weight_) {
template <typename scalar_t>
static void slow_conv3d_update_output_frame(
TensorAccessor<scalar_t, 4> input,
TensorAccessor<const scalar_t, 4> input,
TensorAccessor<scalar_t, 4> output,
TensorAccessor<scalar_t, 2> weight,
TensorAccessor<const scalar_t, 2> weight,
bool has_bias,
TensorAccessor<scalar_t, 2> finput,
TensorAccessor<const scalar_t, 2> finput,
int64_t kernel_depth,
int64_t kernel_height,
int64_t kernel_width,
@ -623,10 +623,10 @@ Tensor& slow_conv3d_forward_out_cpu(const Tensor& self,
TORCH_CHECK(output.is_contiguous(), "slow_conv3d output must be contiguous");
AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "slow_conv3d_cpu", [&] {
auto input_a = input.accessor<scalar_t, 5>();
auto input_a = input.accessor<const scalar_t, 5>();
auto output_a = output.accessor<scalar_t, 5>();
auto finput_a = finput.accessor<scalar_t, 3>();
auto weight_2d_a = weight_2d.accessor<scalar_t, 2>();
auto finput_a = finput.accessor<const scalar_t, 3>();
auto weight_2d_a = weight_2d.accessor<const scalar_t, 2>();
at::parallel_for(
0, batch_size, CONV3D_GRAIN_SALT, [&](int64_t start, int64_t end) {


@ -102,13 +102,12 @@ inline void check_foreach_api_restrictions(
// corresponding tensors (aligning in index across the tensorLists) share the
// same device and dtype.
inline bool _check_tensors_share_device_and_dtype(
ArrayRef<TensorList> tensorLists,
const bool skip_dtype_check = false) {
ArrayRef<TensorList> tensorLists) {
const auto expected_dtype = tensorLists[0][0].dtype();
const auto expected_device = tensorLists[0][0].device();
auto is_tensor_okay = [&](const Tensor& tensor) {
return (skip_dtype_check || tensor.dtype() == expected_dtype) &&
return tensor.dtype() == expected_dtype &&
tensor.device() == expected_device && tensor.layout() == at::kStrided &&
tensor.is_non_overlapping_and_dense();
};


@ -20,9 +20,9 @@ TORCH_META_FUNC(lerp_Tensor)(
" for `weight` but got dtype ", weight.dtype());
build(at::TensorIteratorConfig()
.add_output(maybe_get_output())
.add_input(self)
.add_input(end)
.add_input(weight));
.add_const_input(self)
.add_const_input(end)
.add_const_input(weight));
}
TORCH_META_FUNC(lerp_Scalar)(


@ -52,7 +52,7 @@ void _segment_reduce_lengths_cpu_kernel1(
AT_DISPATCH_FLOATING_TYPES_AND2(
kBFloat16, kHalf, data.scalar_type(), "_segment_reduce_cpu", [&]() {
auto* output_data = output.data_ptr<scalar_t>();
const auto* values_data = data.data_ptr<scalar_t>();
const auto* values_data = data.const_data_ptr<scalar_t>();
for (const auto outer_idx : c10::irange(outer_offset)) {
int64_t segment_start, segment_length;
int64_t segment_end = is_offsets_like ?
@ -145,7 +145,7 @@ Tensor _segment_reduce_lengths_cpu_kernel(
auto output = at::empty(output_shape, data.options());
AT_DISPATCH_INDEX_TYPES(lengths.scalar_type(), "_segment_reduce_lengths_cpu_kernel1", [&]() {
const auto* lengths_data = lengths.data_ptr<index_t>();
const auto* lengths_data = lengths.const_data_ptr<index_t>();
_segment_reduce_lengths_cpu_kernel1(
reduction, data, lengths_data, axis, initial, output, segment_count, lengths_stride_axis);
});
@ -171,7 +171,7 @@ Tensor _segment_reduce_offsets_cpu_kernel(
auto output = at::empty(output_shape, data.options());
AT_DISPATCH_INDEX_TYPES(offsets.scalar_type(), "_segment_reduce_offsets_cpu_kernel1", [&]() {
const auto* offsets_data = offsets.data_ptr<index_t>();
const auto* offsets_data = offsets.const_data_ptr<index_t>();
_segment_reduce_lengths_cpu_kernel1<index_t, /*is_offsets_like=*/true>(
reduction, data, offsets_data, axis, initial, output, segment_count, offsets_stride_axis);
});
@ -214,7 +214,7 @@ void _segment_reduce_cpu_lengths_backward_kernel1(
auto* output_data = output_contig.data_ptr<scalar_t>();
auto* grad_data = grad_contig.data_ptr<scalar_t>();
auto* grad_input_data = grad_input.mutable_data_ptr<scalar_t>();
const auto* values_data = data_contig.data_ptr<scalar_t>();
const auto* values_data = data_contig.const_data_ptr<scalar_t>();
// Used to calculate exclusive prod
scalar_t initial_prod_value;
if (reduction == ReductionType::PROD) {
@ -331,7 +331,7 @@ Tensor _segment_reduce_cpu_lengths_backward_kernel(
AT_DISPATCH_INDEX_TYPES(
lengths_contig.scalar_type(), "_segment_reduce_cpu_lengths_backward_kernel1", [&] {
const auto* lengths_data = lengths_contig.data_ptr<index_t>();
const auto* lengths_data = lengths_contig.const_data_ptr<index_t>();
_segment_reduce_cpu_lengths_backward_kernel1(
grad_contig,
output_contig,
@ -364,7 +364,7 @@ Tensor _segment_reduce_cpu_offsets_backward_kernel(
AT_DISPATCH_INDEX_TYPES(
offsets_contig.scalar_type(), "_segment_reduce_cpu_offsets_backward_kernel1", [&] {
const auto* offsets_data = offsets_contig.data_ptr<index_t>();
const auto* offsets_data = offsets_contig.const_data_ptr<index_t>();
_segment_reduce_cpu_lengths_backward_kernel1<index_t, /*is_offsets_like=*/true>(
grad_contig,
output_contig,


@ -1,5 +1,6 @@
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/native/CPUBlas.h>
#include <ATen/native/cpu/zmath.h>
#include <c10/util/irange.h>
@ -337,20 +338,22 @@ void gemm_transa_(
at::native::blas_impl::fp16_gemv_trans(k, m, alpha, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(b), 1, beta, reinterpret_cast<float16_t*>(c), 1);
return;
}
const auto *a_ = a;
for (const auto i : c10::irange(m)) {
const auto *b_ = b;
for (const auto j : c10::irange(n)) {
const auto dot = compute_dot(reinterpret_cast<const float16_t*>(a_), reinterpret_cast<const float16_t*>(b_), k);
b_ += ldb;
if (beta == 0) {
c[j*ldc+i] = alpha*dot;
} else {
c[j*ldc+i] = beta*c[j*ldc+i]+alpha*dot;
parallel_for(0, m, 1, [&](int64_t begin, int64_t end) {
const auto *a_ = a + begin * lda;
for (const auto i : c10::irange(begin, end)) {
const auto *b_ = b;
for (const auto j : c10::irange(n)) {
const auto dot = compute_dot(reinterpret_cast<const float16_t*>(a_), reinterpret_cast<const float16_t*>(b_), k);
b_ += ldb;
if (beta == 0) {
c[j*ldc+i] = alpha*dot;
} else {
c[j*ldc+i] = beta*c[j*ldc+i]+alpha*dot;
}
}
a_ += lda;
}
a_ += lda;
}
});
}
#endif


@ -292,9 +292,9 @@ Tensor _convolution_depthwise3x3_winograd(
bias_potentially_undefined :
at::zeros({kernel_sizes[0]}, input.options());
auto input_data = input.data_ptr<float>();
auto kernel_data = kernel.data_ptr<float>();
auto bias_data = bias.data_ptr<float>();
auto input_data = input.const_data_ptr<float>();
auto kernel_data = kernel.const_data_ptr<float>();
auto bias_data = bias.const_data_ptr<float>();
auto output_data = output.data_ptr<float>();
at::parallel_for(0, args.batch * args.out_channels, 0, [&](int64_t start, int64_t end) {


@ -321,7 +321,7 @@ void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG generato
auto p = expand_inplace(self, p_cpu);
auto iter = TensorIteratorConfig()
.add_output(self)
.add_input(*p)
.add_const_input(*p)
.check_all_same_dtype(false)
.build();
if (p->scalar_type() == kDouble) {


@ -98,14 +98,14 @@ void histogramdd_cpu_contiguous(Tensor& hist, const TensorList& bin_edges,
return;
}
TensorAccessor<input_t, 2> accessor_in = input.accessor<input_t, 2>();
TensorAccessor<const input_t, 2> accessor_in = input.accessor<const input_t, 2>();
/* Constructs a c10::optional<TensorAccessor> containing an accessor iff
* the optional weight tensor has a value.
*/
const auto accessor_wt = weight.has_value()
? c10::optional<TensorAccessor<input_t, 1>>(weight.value().accessor<input_t, 1>())
: c10::optional<TensorAccessor<input_t, 1>>();
? c10::optional<TensorAccessor<const input_t, 1>>(weight.value().accessor<const input_t, 1>())
: c10::optional<TensorAccessor<const input_t, 1>>();
std::vector<input_t*> bin_seq(D);
std::vector<int64_t> num_bin_edges(D);


@ -36,7 +36,7 @@ multinomial_with_replacement_apply(
/* cumulative probability distribution vector */
Tensor cum_dist = at::empty({n_categories}, self.options());
const scalar_t* const self_ptr = self.data_ptr<scalar_t>();
const scalar_t* const self_ptr = self.const_data_ptr<scalar_t>();
scalar_t* const cum_dist_ptr = cum_dist.data_ptr<scalar_t>();
int64_t* const result_ptr = result.data_ptr<int64_t>();


@ -195,7 +195,7 @@ template <typename scalar_t, typename acc_t=typename scalar_value_type<scalar_t>
void norm_kernel_cpu_impl(TensorIterator& iter, const double& val) {
if (val == 0.0) {
binary_kernel_reduce(iter, NormZeroOps<scalar_t, acc_t, out_t>(), acc_t(0));
} else if (val == 0.0) {
} else if (val == 1.0) {
binary_kernel_reduce(iter, NormOneOps<scalar_t, acc_t, out_t>(), acc_t(0));
} else if (val == 2.0) {
binary_kernel_reduce(iter, NormTwoOps<scalar_t, acc_t, out_t>(), acc_t(0));


@ -32,10 +32,10 @@ PackedTensorAccessor32<scalar_t, ndim, PtrTraits> dummy_packed_accessor32() {
template <int kSize, typename scalar_t, typename index_t>
__global__ void conv_depthwise2d_forward_kernel(
const PackedTensorAccessor32<scalar_t, 4, DefaultPtrTraits> input,
const PackedTensorAccessor32<const scalar_t, 4, DefaultPtrTraits> input,
PackedTensorAccessor32<scalar_t, 4, DefaultPtrTraits> output,
const PackedTensorAccessor32<scalar_t, 4, DefaultPtrTraits> weight,
const PackedTensorAccessor32<scalar_t, 1, DefaultPtrTraits> bias,
const PackedTensorAccessor32<const scalar_t, 4, DefaultPtrTraits> weight,
const PackedTensorAccessor32<const scalar_t, 1, DefaultPtrTraits> bias,
bool biasEnabled,
index_t totalElements,
const int outputChannels,
@ -309,12 +309,12 @@ void conv_depthwise2d_forward_out(
// Create PackedTensorAccessor
// Kernel currently relies upon all the Tensors to be contiguous, but we made
// them contiguous above
const auto input_a = input.packed_accessor32<scalar_t, 4>();
const auto weight_a = weight.packed_accessor32<scalar_t, 4>();
const auto input_a = input.packed_accessor32<const scalar_t, 4>();
const auto weight_a = weight.packed_accessor32<const scalar_t, 4>();
const auto output_a = output.packed_accessor32<scalar_t, 4>();
const auto bias_a = has_bias ?
bias.packed_accessor32<scalar_t, 1>() :
dummy_packed_accessor32<scalar_t, 1>();
bias.packed_accessor32<const scalar_t, 1>() :
dummy_packed_accessor32<const scalar_t, 1>();
if (kW == 3 && kH == 3) {
conv_depthwise2d_forward_kernel<3> <<<grid, block, 0, stream>>>(
input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier,


@ -26,9 +26,9 @@ template <typename scalar_t, typename accscalar_t,
int kKnownKernelT, int kKnownKernelH, int kKnownKernelW,
int kKnownDilationT, int kKnownDilationH, int kKnownDilationW>
__global__ void conv_depthwise3d_cuda_kernel(
const PackedTensorAccessor32<scalar_t, 5> input,
const PackedTensorAccessor32<const scalar_t, 5> input,
PackedTensorAccessor32<scalar_t, 5> output,
const PackedTensorAccessor32<scalar_t, 5> kernel,
const PackedTensorAccessor32<const scalar_t, 5> kernel,
const scalar_t* bias,
int strideT, int strideH, int strideW,
int paddingT, int paddingH, int paddingW,
@ -361,9 +361,9 @@ void conv_depthwise_shape_check(
conv_depthwise3d_cuda_kernel \
<scalar_t, accscalar_t, (kt), (kh), (kw), (dilt), (dilh), (dilw)> \
<<<grid, block, (smem), at::cuda::getCurrentCUDAStream()>>>( \
input_.packed_accessor32<scalar_t, 5>(), \
input_.packed_accessor32<const scalar_t, 5>(), \
output_.packed_accessor32<scalar_t, 5>(), \
weight_.packed_accessor32<scalar_t, 5>(), \
weight_.packed_accessor32<const scalar_t, 5>(), \
bias_ptr, \
stride[0], stride[1], stride[2], \
padding[0], padding[1], padding[2], \
@ -377,9 +377,9 @@ void conv_depthwise_shape_check(
conv_depthwise3d_cuda_kernel \
<scalar_t,accscalar_t, -1, -1, -1, -1, -1, -1> \
<<<grid, block, (smem), at::cuda::getCurrentCUDAStream()>>>( \
input_.packed_accessor32<scalar_t, 5>(), \
input_.packed_accessor32<const scalar_t, 5>(), \
output_.packed_accessor32<scalar_t, 5>(), \
weight_.packed_accessor32<scalar_t, 5>(), \
weight_.packed_accessor32<const scalar_t, 5>(), \
bias_ptr, \
stride[0], stride[1], stride[2], \
padding[0], padding[1], padding[2], \


@ -618,7 +618,7 @@ void bernoulli_tensor_cuda_kernel(
};
// The template argument `4` below indicates that we want to operate on four
// elements at a time. See NOTE [ CUDA_tensor_applyN helpers ] for details.
at::cuda::CUDA_tensor_apply2<scalar_t, prob_t, 4, decltype(functor),
at::cuda::CUDA_tensor_apply2<scalar_t, const prob_t, 4, decltype(functor),
/*max_threads_per_block=*/512,
/*min_blocks_per_sm==*/2>(ret, p, functor);
}


@ -187,8 +187,8 @@ void masked_scale_kernel(at::Tensor& ret, const at::Tensor& src, const at::Tenso
auto iter = at::TensorIteratorConfig()
.check_all_same_dtype(false)
.add_output(ret)
.add_input(src)
.add_input(mask)
.add_const_input(src)
.add_const_input(mask)
.build();
at::native::gpu_kernel(


@ -4,7 +4,6 @@
#include <ATen/native/cuda/ForeachFunctors.cuh>
#include <ATen/native/cuda/ForeachMinMaxFunctors.cuh>
#include <functional>
#include <type_traits>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/NativeFunctions.h>
@ -251,152 +250,20 @@ FOREACH_BINARY_OP_LIST(
power_functor,
/*division_op*/ true);
template <typename dst_t, typename src_t = dst_t>
struct Copy {
__device__ __forceinline__ dst_t operator()(const src_t& x) {
return static_cast<dst_t>(x);
template <typename T>
struct Identity {
__device__ __forceinline__ T operator()(const T& x) {
return x;
}
};
template <typename dst_t>
struct Copy<dst_t, c10::complex<double>> {
__device__ __forceinline__ dst_t operator()(const c10::complex<double>& x) {
if constexpr (!(std::is_same_v<dst_t, c10::complex<double>> ||
std::is_same_v<dst_t, c10::complex<float>>)) {
return static_cast<dst_t>(x.real());
} else {
return static_cast<dst_t>(x);
}
}
};
template <typename dst_t>
struct Copy<dst_t, c10::complex<float>> {
__device__ __forceinline__ dst_t operator()(const c10::complex<float>& x) {
if constexpr (!(std::is_same_v<dst_t, c10::complex<double>> ||
std::is_same_v<dst_t, c10::complex<float>>)) {
return static_cast<dst_t>(x.real());
} else {
return static_cast<dst_t>(x);
}
}
};
#define AT_DISPATCH_SOURCE_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH( \
TYPE, \
NAME, \
AT_PRIVATE_CASE_TYPE_USING_HINT( \
at::ScalarType::Byte, src_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE_USING_HINT( \
at::ScalarType::Char, src_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE_USING_HINT( \
at::ScalarType::Long, src_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE_USING_HINT( \
at::ScalarType::Short, src_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE_USING_HINT( \
at::ScalarType::Double, src_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE_USING_HINT( \
at::ScalarType::Float, src_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE_USING_HINT( \
at::ScalarType::ComplexDouble, \
src_t, \
__VA_ARGS__) \
AT_PRIVATE_CASE_TYPE_USING_HINT( \
at::ScalarType::ComplexFloat, \
src_t, \
__VA_ARGS__) \
AT_PRIVATE_CASE_TYPE_USING_HINT( \
at::ScalarType::Half, \
src_t, \
__VA_ARGS__) \
AT_PRIVATE_CASE_TYPE_USING_HINT( \
at::ScalarType::BFloat16, \
src_t, \
__VA_ARGS__) \
AT_PRIVATE_CASE_TYPE_USING_HINT( \
at::ScalarType::Bool, \
src_t, \
__VA_ARGS__))
namespace {
template <
typename T,
typename src_t,
int depth,
int r_args_depth,
int res_arg_index>
struct CopyFunctor {
static_assert(depth == 2 && r_args_depth == 1 && res_arg_index == 1);
template <typename Op>
__device__ __forceinline__ void operator()(
int chunk_size,
TensorListMetadata<depth>& tl,
Op op) {
const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
auto n = tl.numel_for_tensor[tensor_loc];
src_t* src_ptr = (src_t*)tl.addresses[0][tensor_loc];
src_ptr += chunk_idx * chunk_size;
T* self_ptr = (T*)tl.addresses[1][tensor_loc];
self_ptr += chunk_idx * chunk_size;
const bool all_aligned{is_aligned(src_ptr) && is_aligned(self_ptr)};
n -= chunk_idx * chunk_size;
src_t src_args[kILP];
T r_args[kILP];
// to make things simple, we put aligned case in a different code path
if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
for (int64_t i_start = threadIdx.x;
i_start * kILP < n && i_start * kILP < chunk_size;
i_start += blockDim.x) {
// load
load_store(src_args, src_ptr, 0, i_start);
#pragma unroll
for (int ii = 0; ii < kILP; ii++) {
r_args[ii] = static_cast<T>(op(src_args[ii]));
}
// store
load_store(self_ptr, r_args, i_start, 0);
}
} else {
for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
i_start += blockDim.x * kILP) {
#pragma unroll
for (int ii = 0; ii < kILP; ii++) {
const auto i = i_start + threadIdx.x + ii * blockDim.x;
src_args[ii] = src_ptr[i];
}
#pragma unroll
for (int ii = 0; ii < kILP; ii++) {
r_args[ii] = static_cast<T>(op(src_args[ii]));
}
store_args(self_ptr, r_args, i_start, chunk_size, n);
}
}
}
};
} // anonymous namespace
void foreach_tensor_copy_list_kernel_cuda_(
TensorList self,
TensorList src,
const bool non_blocking) {
check_foreach_api_restrictions(self, src);
if (!(_check_tensors_share_device_and_dtype(
{self, src}, /* skip_dtype_check */ true) &&
std::all_of(
src.cbegin(),
src.cend(),
[&](const auto& t) -> bool {
return t.dtype() == src[0].dtype();
}) &&
_check_tensors_share_sizes_and_strides({self, src}))) {
if (!can_use_fast_route(
self, src, /* does_op_promote_integer_inputs_to_float */ false)) {
return at::native::foreach_tensor_copy_list_kernel_slow_(
self, src, non_blocking);
}
@ -411,38 +278,16 @@ void foreach_tensor_copy_list_kernel_cuda_(
"foreach_tensor_copy",
[&]() {
using opmath_t = at::opmath_type<scalar_t>;
AT_DISPATCH_SOURCE_TYPES(src[0].scalar_type(), "foreach_tensor_copy", [&] {
if constexpr (std::is_same_v<scalar_t, src_t>) {
multi_tensor_apply<2>(
tensor_lists,
UnaryOpFunctor<
scalar_t,
/* depth */ 2,
/* r_args_depth */ 1,
/* res_arg_index */ 1>(),
Copy<opmath_t, opmath_t>());
} else {
// Ref:
// https://github.com/pytorch/pytorch/blob/656134c38f4737d13c3f43fc5c59470bc23c1d2f/aten/src/ATen/native/Copy.cpp#L299-L301
if (!self[0].is_complex() && src[0].is_complex()) {
TORCH_WARN_ONCE(
"Casting complex values to real discards the imaginary part");
}
multi_tensor_apply<2>(
tensor_lists,
CopyFunctor<
scalar_t,
src_t,
/* depth */ 2,
/* r_args_depth */ 1,
/* res_arg_index */ 1>(),
Copy<scalar_t, src_t>());
}
});
multi_tensor_apply<2>(
tensor_lists,
UnaryOpFunctor<
scalar_t,
/* depth */ 2,
/* r_args_depth */ 1,
/* res_arg_index */ 1>(),
Identity<opmath_t>());
});
increment_version(self);
}
#undef AT_DISPATCH_SOURCE_TYPES
} // namespace at::native


@ -65,7 +65,7 @@ C10_LAUNCH_BOUNDS_1(cuda::getApplyBlockSize())
__global__ void kernelHistogram1D(
detail::TensorInfo<output_t, IndexType> a, /* output */
detail::TensorInfo<output_t, IndexType> p, /* partial output */
detail::TensorInfo<input_t, IndexType> b, /* input */
detail::TensorInfo<const input_t, IndexType> b, /* input */
int64_t nbins,
at::acc_type<input_t, /*is_cuda=*/true> minvalue,
at::acc_type<input_t, /*is_cuda=*/true> maxvalue,
@ -86,7 +86,7 @@ __global__ void kernelHistogram1D(
FOR_KERNEL_LOOP(linearIndex, totalElements) {
// Convert `linearIndex` into an offset of `b`
const IndexType bOffset =
detail::IndexToOffset<input_t, IndexType, BDims>::get(linearIndex, b);
detail::IndexToOffset<const input_t, IndexType, BDims>::get(linearIndex, b);
const auto bVal = b.data[bOffset];
if (bVal >= minvalue && bVal <= maxvalue) {
// Use value at `b` as an offset of `smem`
@ -112,7 +112,7 @@ __global__ void kernelHistogram1D(
FOR_KERNEL_LOOP(linearIndex, totalElements) {
// Convert `linearIndex` into an offset of `b`
const IndexType bOffset =
detail::IndexToOffset<input_t, IndexType, BDims>::get(linearIndex, b);
detail::IndexToOffset<const input_t, IndexType, BDims>::get(linearIndex, b);
const auto bVal = b.data[bOffset];
if (bVal >= minvalue && bVal <= maxvalue) {
// Use value at `b` as an offset of `a`
@ -219,7 +219,7 @@ bool CUDA_tensor_histogram(
using IndexType = int64_t;
auto aInfo = detail::getTensorInfo<output_t, IndexType>(a);
auto bInfo = detail::getTensorInfo<input_t, IndexType>(b);
auto bInfo = detail::getTensorInfo<const input_t, IndexType>(b);
detail::TensorInfo<output_t, IndexType> pInfo(nullptr, 0, {}, {});
if (HasWeights) {


@ -1500,7 +1500,11 @@ NvrtcFunction jit_pwise_function(
std::stringstream ss;
ss << *cache_dir << "/";
ss << kernel_name;
#ifdef USE_ROCM
ss << "_arch" << prop->gcnArchName;
#else
ss << "_arch" << cuda_major << "." << cuda_minor;
#endif
ss << "_nvrtc" << nvrtc_major << "." << nvrtc_minor;
ss << (compile_to_sass ? "_sass" : "_ptx");
ss << "_" << code.length();
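For context, this makes the JIT kernel-cache key architecture-specific on ROCm (gcnArchName) instead of reusing the CUDA major.minor scheme. Hypothetical resulting key prefixes, with every value invented for illustration and the components after code.length() truncated in this excerpt:
//   ROCm:  <cache_dir>/my_kernel_archgfx90a_nvrtc11.8_sass_4096_...
//   CUDA:  <cache_dir>/my_kernel_arch8.6_nvrtc11.8_sass_4096_...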


@ -1078,8 +1078,8 @@ static void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& ot
auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
auto trans = transpose ? (input.is_complex() ? CUBLAS_OP_C : CUBLAS_OP_T) : CUBLAS_OP_N;
auto input_data = input.data_ptr<scalar_t>();
auto tau_data = tau.data_ptr<scalar_t>();
auto input_data = input.const_data_ptr<scalar_t>();
auto tau_data = tau.const_data_ptr<scalar_t>();
auto other_data = other.data_ptr<scalar_t>();
auto input_matrix_stride = matrixStride(input);
@ -1101,9 +1101,9 @@ static void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& ot
auto info_data = info.data_ptr<int>();
for (auto i = decltype(batch_size){0}; i < batch_size; i++) {
scalar_t* input_working_ptr = &input_data[i * input_matrix_stride];
const scalar_t* input_working_ptr = &input_data[i * input_matrix_stride];
scalar_t* other_working_ptr = &other_data[i * other_matrix_stride];
scalar_t* tau_working_ptr = &tau_data[i * tau_stride];
const scalar_t* tau_working_ptr = &tau_data[i * tau_stride];
auto handle = at::cuda::getCurrentCUDASolverDnHandle();
// allocate workspace storage
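
This file, like most of the cuDNN/MIOpen/oneDNN changes below, is a const-correctness pass: operands that are only read move from data_ptr<T>() to const_data_ptr<T>(), which returns const T*, while outputs keep data_ptr()/mutable_data_ptr(). A minimal sketch of the pattern with illustrative names (not taken from this diff; assumes contiguous float CPU tensors):

    #include <ATen/ATen.h>

    // x is only read, y is written in place.
    void axpy_like(const at::Tensor& x, at::Tensor& y, float alpha) {
      const float* x_data = x.const_data_ptr<float>();  // read-only operand
      float* y_data = y.mutable_data_ptr<float>();      // mutated output
      for (int64_t i = 0; i < y.numel(); ++i) {
        y_data[i] += alpha * x_data[i];
      }
    }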


@ -95,9 +95,9 @@ std::ostream& operator<<(std::ostream& out, const ConvolutionArgs& args) {
<< "weight: " << args.wdesc // already has a trailing newline
<< "Pointer addresses: "
<< "\n"
<< " input: " << args.input.data_ptr() << "\n"
<< " output: " << args.output.data_ptr() << "\n"
<< " weight: " << args.weight.data_ptr() << "\n";
<< " input: " << args.input.const_data_ptr() << "\n"
<< " output: " << args.output.const_data_ptr() << "\n"
<< " weight: " << args.weight.const_data_ptr() << "\n";
return out;
}
@ -306,9 +306,9 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
cudnnFindConvolutionForwardAlgorithmEx(
args.handle,
args.idesc.desc(),
args.input.data_ptr(),
args.input.const_data_ptr(),
args.wdesc.desc(),
args.weight.data_ptr(),
args.weight.const_data_ptr(),
args.cdesc.desc(),
args.odesc.desc(),
args.output.data_ptr(),
@ -390,9 +390,9 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
cudnnFindConvolutionBackwardDataAlgorithmEx(
args.handle,
args.wdesc.desc(),
args.weight.data_ptr(),
args.weight.const_data_ptr(),
args.odesc.desc(),
args.output.data_ptr(),
args.output.const_data_ptr(),
args.cdesc.desc(),
args.idesc.desc(),
args.input.data_ptr(),
@ -478,9 +478,9 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
cudnnFindConvolutionBackwardFilterAlgorithmEx(
args.handle,
args.idesc.desc(),
args.input.data_ptr(),
args.input.const_data_ptr(),
args.odesc.desc(),
args.output.data_ptr(),
args.output.const_data_ptr(),
args.cdesc.desc(),
args.wdesc.desc(),
args.weight.data_ptr(),
@ -760,9 +760,9 @@ void raw_cudnn_convolution_forward_out_32bit(
args.handle,
&one,
args.idesc.desc(),
input.data_ptr(),
input.const_data_ptr(),
args.wdesc.desc(),
weight.data_ptr(),
weight.const_data_ptr(),
args.cdesc.desc(),
fwdAlgPerf.algo,
workspace.data_ptr(),
@ -871,9 +871,9 @@ void raw_cudnn_convolution_backward_input_out_32bit(
args.handle,
&one,
args.wdesc.desc(),
weight.data_ptr(),
weight.const_data_ptr(),
args.odesc.desc(),
grad_output.data_ptr(),
grad_output.const_data_ptr(),
args.cdesc.desc(),
bwdDataAlgPerf.algo,
workspace.data_ptr(),
@ -884,7 +884,7 @@ void raw_cudnn_convolution_backward_input_out_32bit(
args,
"Additional pointer addresses: \n",
" grad_output: ",
grad_output.data_ptr(),
grad_output.const_data_ptr(),
"\n",
" grad_input: ",
grad_input.mutable_data_ptr(),
@ -990,9 +990,9 @@ void raw_cudnn_convolution_backward_weight_out_32bit(
args.handle,
&one,
args.idesc.desc(),
input.data_ptr(),
input.const_data_ptr(),
args.odesc.desc(),
grad_output.data_ptr(),
grad_output.const_data_ptr(),
args.cdesc.desc(),
bwdFilterAlgPerf.algo,
workspace.data_ptr(),
@ -1003,7 +1003,7 @@ void raw_cudnn_convolution_backward_weight_out_32bit(
args,
"Additional pointer addresses: \n",
" grad_output: ",
grad_output.data_ptr(),
grad_output.const_data_ptr(),
"\n",
" grad_weight: ",
grad_weight.data_ptr(),
@ -1173,18 +1173,18 @@ void raw_cudnn_convolution_add_relu_out_v7(
args.handle,
&one,
args.idesc.desc(),
input.data_ptr(),
input.const_data_ptr(),
args.wdesc.desc(),
weight.data_ptr(),
weight.const_data_ptr(),
args.cdesc.desc(),
fwdAlgPerf.algo,
workspace.data_ptr(),
fwdAlgPerf.memory,
&alpha_,
zdesc.desc(),
z.data_ptr(),
z.const_data_ptr(),
bdesc.desc(),
bias.data_ptr(),
bias.const_data_ptr(),
adesc.desc(),
args.odesc.desc(),
output.data_ptr()),


@ -52,7 +52,7 @@ constexpr int64_t operator"" _TiB(unsigned long long n) {
uint8_t getAlignment(const Tensor& t) {
// alignment are in bytes
uint8_t alignment = 1;
uintptr_t address = reinterpret_cast<uintptr_t>(t.data_ptr());
uintptr_t address = reinterpret_cast<uintptr_t>(t.const_data_ptr());
for (; alignment < 32; alignment *= 2) {
if (address % (alignment * 2)) {
return alignment;
@ -358,12 +358,30 @@ void run_conv_plan(
const Tensor& x,
const Tensor& y,
const Tensor& w,
const cudnn_frontend::ExecutionPlan& plan) {
const cudnn_frontend::ExecutionPlan& plan,
const cudnnBackendDescriptorType_t operation) {
c10::DeviceGuard g(x.options().device());
auto workspace_size = plan.getWorkspaceSize();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
void* data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr()};
void* data_ptrs[3];
if (operation == CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) {
data_ptrs[0] = const_cast<void*>(x.const_data_ptr());
data_ptrs[1] = y.data_ptr();
data_ptrs[2] = const_cast<void*>(w.const_data_ptr());
} else if (
operation ==
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR) {
data_ptrs[0] = x.data_ptr();
data_ptrs[1] = const_cast<void*>(y.const_data_ptr());
data_ptrs[2] = const_cast<void*>(w.const_data_ptr());
} else {
data_ptrs[0] = x.data_ptr();
data_ptrs[1] = y.data_ptr();
data_ptrs[2] = w.data_ptr();
}
int64_t uids[] = {'x', 'y', 'w'};
auto variantPack =
cudnn_frontend::VariantPackBuilder()
@ -843,10 +861,11 @@ void try_plans(
const cudnnHandle_t handle,
const Tensor& x,
const Tensor& y,
const Tensor& w) {
const Tensor& w,
const cudnnBackendDescriptorType_t operation) {
for (auto& plan : plans) {
try {
run_conv_plan(handle, x, y, w, plan);
run_conv_plan(handle, x, y, w, plan, operation);
benchmark_cache.update(key, plan);
return;
} catch (cudnn_frontend::cudnnException& e) {
@ -890,7 +909,8 @@ bool try_configs(
const cudnnHandle_t handle,
const Tensor& x,
const Tensor& y,
const Tensor& w) {
const Tensor& w,
const cudnnBackendDescriptorType_t operation) {
for (auto& config : configs) {
try {
auto plan = cudnn_frontend::ExecutionPlanBuilder()
@ -900,7 +920,7 @@ bool try_configs(
if (plan_errata_exception(handle, plan.getTag())) {
continue;
}
run_conv_plan(handle, x, y, w, plan);
run_conv_plan(handle, x, y, w, plan, operation);
benchmark_cache.update(key, plan);
return true;
} catch (cudnn_frontend::cudnnException& e) {
@ -971,7 +991,7 @@ void run_single_conv(
auto search = benchmark_cache.find(key);
if (search) {
try {
run_conv_plan(handle, x, y, w, *search);
run_conv_plan(handle, x, y, w, *search, operation);
return;
} catch (c10::OutOfMemoryError& e) {
(void)cudaGetLastError(); // clear CUDA error
@ -994,7 +1014,7 @@ void run_single_conv(
deterministic,
allow_tf32,
false);
if (try_configs(configs, opgraph_tag, key, handle, x, y, w)) {
if (try_configs(configs, opgraph_tag, key, handle, x, y, w, operation)) {
return;
}
// fallback configs
@ -1012,7 +1032,7 @@ void run_single_conv(
deterministic,
allow_tf32,
true);
if (try_configs(configs, opgraph_tag, key, handle, x, y, w)) {
if (try_configs(configs, opgraph_tag, key, handle, x, y, w, operation)) {
return;
}
TORCH_CHECK(
@ -1035,7 +1055,7 @@ void run_single_conv(
if (at::native::_cudnn_get_conv_benchmark_empty_cache()) {
c10::cuda::CUDACachingAllocator::emptyCache();
}
try_plans(plans, key, handle, x, y, w);
try_plans(plans, key, handle, x, y, w, operation);
}
}
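
run_conv_plan now receives the backend operation type because the cuDNN frontend variant pack only accepts non-const void* pointers: which of x/y/w is the writable output depends on whether the plan is a forward, backward-data, or backward-filter convolution, so the read-only operands go through const_data_ptr() and are const_cast back. A small sketch of that idiom, with a hypothetical helper name:

    #include <ATen/ATen.h>

    // Read-only operand handed to a C API that only takes void*; the const_cast is
    // safe here because the selected plan never writes through this pointer.
    inline void* as_variant_pack_ptr(const at::Tensor& t) {
      return const_cast<void*>(t.const_data_ptr());
    }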


@ -2,6 +2,7 @@
#include <ATen/native/layer_norm.h>
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/native/cpu/mixed_data_type.h>
#include <c10/util/irange.h>
@ -18,6 +19,9 @@
#include <ATen/ops/native_layer_norm.h>
#include <ATen/ops/native_layer_norm_backward_native.h>
#include <ATen/ops/native_layer_norm_native.h>
#include <ATen/ops/pow.h>
#include <ATen/ops/rsqrt.h>
#include <ATen/ops/rms_norm.h>
#include <ATen/ops/zeros_like_native.h>
#endif
@ -258,4 +262,49 @@ std::tuple<Tensor, Tensor, Tensor> math_native_layer_norm(
rstd = rstd.view(stat_shape);
return std::make_tuple(out, mean, rstd);
}
Tensor rms_norm(
const Tensor& input,
IntArrayRef normalized_shape,
const c10::optional<Tensor>& weight_opt /* optional */,
c10::optional<double> eps) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
auto bias_opt = at::optional<Tensor>();
const Tensor& bias = *at::borrow_from_optional_tensor(bias_opt);
(void) _check_layer_norm_inputs(input, normalized_shape, weight, bias);
std::vector<int64_t> dims_to_reduce;
for (const auto i : c10::irange(normalized_shape.size())) {
dims_to_reduce.push_back(input.dim() - i - 1);
}
IntArrayRef dims_to_reduce_ref = IntArrayRef(dims_to_reduce);
auto result = AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
at::ScalarType::BFloat16,
input.scalar_type(),
"rms_norm",
[&] {
scalar_t eps_val;
if (!eps.has_value()) {
eps_val = std::numeric_limits<at::scalar_value_type<scalar_t>::type>::epsilon();
} else {
eps_val = eps.value();
}
auto result = input.mul(at::rsqrt(at::pow(input, 2).mean(dims_to_reduce_ref, /*keep_dim=*/true).add_(eps_val)));
if (weight_opt.has_value()) {
result = result.mul(weight_opt.value());
}
return result;
});
return result;
}
} // namespace at::native
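
The composite op added above computes y = x * rsqrt(mean(x^2 over the normalized dimensions) + eps), optionally scaled by weight, with eps defaulting to the dtype's machine epsilon. A hedged usage sketch, assuming the op is exposed as at::rms_norm via the native_functions.yaml entry later in this diff:

    #include <ATen/ATen.h>

    at::Tensor rms_norm_demo() {
      at::Tensor x = at::randn({8, 16});
      at::Tensor w = at::ones({16});
      // Normalize over the last dimension; eps falls back to the dtype's epsilon.
      return at::rms_norm(x, /*normalized_shape=*/{16}, w, /*eps=*/c10::nullopt);
    }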


@ -71,6 +71,12 @@ void layer_norm_cpu_out(
int64_t M,
int64_t N);
Tensor rms_norm(
const Tensor& input,
IntArrayRef normalized_shape,
const c10::optional<Tensor>& weight_opt /* optional */,
c10::optional<double> eps);
using forward_fn = void (*)(
const Tensor& /* X */,
const Tensor& /* gamma */,


@ -371,8 +371,8 @@ struct algorithm_search<miopenConvFwdAlgorithm_t> {
Workspace ws(max_ws_size);
MIOPEN_CHECK(miopenFindConvolutionForwardAlgorithm(
args.handle,
args.idesc.desc(), args.input.data_ptr(),
args.wdesc.desc(), args.weight.data_ptr(),
args.idesc.desc(), args.input.const_data_ptr(),
args.wdesc.desc(), args.weight.const_data_ptr(),
args.cdesc.desc(),
args.odesc.desc(), args.output.data_ptr(),
1, // just return the fastest
@ -444,8 +444,8 @@ struct algorithm_search<miopenConvBwdDataAlgorithm_t> {
Workspace ws(max_ws_size);
MIOPEN_CHECK(miopenFindConvolutionBackwardDataAlgorithm(
args.handle,
args.odesc.desc(), args.output.data_ptr(),
args.wdesc.desc(), args.weight.data_ptr(),
args.odesc.desc(), args.output.const_data_ptr(),
args.wdesc.desc(), args.weight.const_data_ptr(),
args.cdesc.desc(),
args.idesc.desc(), args.input.data_ptr(),
1, // just return the fastest
@ -517,8 +517,8 @@ struct algorithm_search<miopenConvBwdWeightsAlgorithm_t> {
Workspace ws(max_ws_size);
MIOPEN_CHECK(miopenFindConvolutionBackwardWeightsAlgorithm(
args.handle,
args.odesc.desc(), args.output.data_ptr(),
args.idesc.desc(), args.input.data_ptr(),
args.odesc.desc(), args.output.const_data_ptr(),
args.idesc.desc(), args.input.const_data_ptr(),
args.cdesc.desc(),
args.wdesc.desc(), args.weight.data_ptr(),
1, // just return the fastest
@ -682,7 +682,7 @@ void miopen_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const
Constant one(dataType, 1);
Constant zero(dataType, 0);
MIOPEN_CHECK(miopenConvolutionForwardBias(handle, &one, bdesc.desc(), bias->data_ptr(),
MIOPEN_CHECK(miopenConvolutionForwardBias(handle, &one, bdesc.desc(), bias->const_data_ptr(),
&zero, odesc.desc(), output->data_ptr()));
*/
}
@ -730,8 +730,8 @@ void raw_miopen_convolution_forward_out(
MIOPEN_CHECK(miopenConvolutionForward(
args.handle,
&one, args.idesc.desc(), input.data_ptr(),
args.wdesc.desc(), weight.data_ptr(),
&one, args.idesc.desc(), input.const_data_ptr(),
args.wdesc.desc(), weight.const_data_ptr(),
args.cdesc.desc(), fwdAlg, &zero,
args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size));
}
@ -741,8 +741,8 @@ void raw_miopen_convolution_forward_out(
MIOPEN_CHECK(miopenConvolutionForwardImmediate(
args.handle,
args.wdesc.desc(), weight.data_ptr(),
args.idesc.desc(), input.data_ptr(),
args.wdesc.desc(), weight.const_data_ptr(),
args.idesc.desc(), input.const_data_ptr(),
args.cdesc.desc(),
args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size, solution_id));
}
@ -838,8 +838,8 @@ void raw_miopen_depthwise_convolution_forward_out(
MIOPEN_CHECK(miopenConvolutionForward(
args.handle,
&one, args.idesc.desc(), input.data_ptr(),
args.wdesc.desc(), weight.data_ptr(),
&one, args.idesc.desc(), input.const_data_ptr(),
args.wdesc.desc(), weight.const_data_ptr(),
args.cdesc.desc(), fwdAlg, &zero,
args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size));
}
@ -849,8 +849,8 @@ void raw_miopen_depthwise_convolution_forward_out(
MIOPEN_CHECK(miopenConvolutionForwardImmediate(
args.handle,
args.wdesc.desc(), weight.data_ptr(),
args.idesc.desc(), input.data_ptr(),
args.wdesc.desc(), weight.const_data_ptr(),
args.idesc.desc(), input.const_data_ptr(),
args.cdesc.desc(),
args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size, solution_id));
}
@ -993,8 +993,8 @@ void raw_miopen_convolution_backward_weight_out(
MIOPEN_CHECK(miopenConvolutionBackwardWeights(
args.handle,
&one, args.odesc.desc(), grad_output.data_ptr(),
args.idesc.desc(), input.data_ptr(),
&one, args.odesc.desc(), grad_output.const_data_ptr(),
args.idesc.desc(), input.const_data_ptr(),
args.cdesc.desc(), bwdFilterAlg, &zero,
args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size));
}
@ -1004,8 +1004,8 @@ void raw_miopen_convolution_backward_weight_out(
MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate(
args.handle,
args.odesc.desc(), grad_output.data_ptr(),
args.idesc.desc(), input.data_ptr(),
args.odesc.desc(), grad_output.const_data_ptr(),
args.idesc.desc(), input.const_data_ptr(),
args.cdesc.desc(),
args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size, solution_id));
}
@ -1037,8 +1037,8 @@ void raw_miopen_depthwise_convolution_backward_weight_out(
MIOPEN_CHECK(miopenConvolutionBackwardWeights(
args.handle,
&one, args.odesc.desc(), grad_output.data_ptr(),
args.idesc.desc(), input.data_ptr(),
&one, args.odesc.desc(), grad_output.const_data_ptr(),
args.idesc.desc(), input.const_data_ptr(),
args.cdesc.desc(), bwdFilterAlg, &zero,
args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size));
}
@ -1048,8 +1048,8 @@ void raw_miopen_depthwise_convolution_backward_weight_out(
MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate(
args.handle,
args.odesc.desc(), grad_output.data_ptr(),
args.idesc.desc(), input.data_ptr(),
args.odesc.desc(), grad_output.const_data_ptr(),
args.idesc.desc(), input.const_data_ptr(),
args.cdesc.desc(),
args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size, solution_id));
}
@ -1242,8 +1242,8 @@ void raw_miopen_convolution_backward_input_out(
MIOPEN_CHECK(miopenConvolutionBackwardData(
args.handle,
&one, args.odesc.desc(), grad_output.data_ptr(),
args.wdesc.desc(), weight.data_ptr(),
&one, args.odesc.desc(), grad_output.const_data_ptr(),
args.wdesc.desc(), weight.const_data_ptr(),
args.cdesc.desc(), bwdDataAlg, &zero,
args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size));
}
@ -1253,8 +1253,8 @@ void raw_miopen_convolution_backward_input_out(
MIOPEN_CHECK(miopenConvolutionBackwardDataImmediate(
args.handle,
args.odesc.desc(), grad_output.data_ptr(),
args.wdesc.desc(), weight.data_ptr(),
args.odesc.desc(), grad_output.const_data_ptr(),
args.wdesc.desc(), weight.const_data_ptr(),
args.cdesc.desc(),
args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size, solution_id));
}
@ -1351,8 +1351,8 @@ void raw_miopen_depthwise_convolution_backward_input_out(
MIOPEN_CHECK(miopenConvolutionBackwardData(
args.handle,
&one, args.odesc.desc(), grad_output.data_ptr(),
args.wdesc.desc(), weight.data_ptr(),
&one, args.odesc.desc(), grad_output.const_data_ptr(),
args.wdesc.desc(), weight.const_data_ptr(),
args.cdesc.desc(), bwdDataAlg, &zero,
args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size));
}
@ -1362,8 +1362,8 @@ void raw_miopen_depthwise_convolution_backward_input_out(
MIOPEN_CHECK(miopenConvolutionBackwardDataImmediate(
args.handle,
args.odesc.desc(), grad_output.data_ptr(),
args.wdesc.desc(), weight.data_ptr(),
args.odesc.desc(), grad_output.const_data_ptr(),
args.wdesc.desc(), weight.const_data_ptr(),
args.cdesc.desc(),
args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size, solution_id));
}
@ -1528,11 +1528,11 @@ void raw_miopen_convolution_relu_out(
float activ_gamma = static_cast<float>(0);
miopenOperatorArgs_t fusionArgs;
MIOPEN_CHECK(miopenCreateOperatorArgs(&fusionArgs));
MIOPEN_CHECK(miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weight.data_ptr()));
MIOPEN_CHECK(miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.data_ptr()));
MIOPEN_CHECK(miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weight.const_data_ptr()));
MIOPEN_CHECK(miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.const_data_ptr()));
MIOPEN_CHECK(miopenSetOpArgsActivForward(fusionArgs, activOp, &alpha, &beta, activ_alpha, activ_beta, activ_gamma));
miopenExecuteFusionPlan(args.handle, fusePlanDesc, args.idesc.desc(), input.data_ptr(), args.odesc.desc(), output.data_ptr(), fusionArgs);
miopenExecuteFusionPlan(args.handle, fusePlanDesc, args.idesc.desc(), input.const_data_ptr(), args.odesc.desc(), output.data_ptr(), fusionArgs);
// Cleanup
miopenDestroyFusionPlan(fusePlanDesc);


@ -223,10 +223,10 @@ static void _mkldnn_convolution_out (
auto memory_format = mkldnn_convolution_memory_format(input_t.ndimension(), is_channels_last);
auto input = input_t.is_mkldnn() ? input_t : input_t.contiguous(memory_format);
auto weight = weight_t.is_mkldnn() ? weight_t : weight_t.contiguous(memory_format);
const ideep::tensor x = itensor_from_tensor(input);
const ideep::tensor w = itensor_from_tensor(weight);
const ideep::tensor x = itensor_from_tensor(input, /*from_const_data_ptr*/true);
const ideep::tensor w = itensor_from_tensor(weight, /*from_const_data_ptr*/true);
if (bias.defined()) {
const ideep::tensor b = itensor_from_tensor(bias);
const ideep::tensor b = itensor_from_tensor(bias, /*from_const_data_ptr*/true);
ideep::convolution_forward::compute_v3(
x,
w,
@ -704,9 +704,9 @@ Tensor _mkldnn_convolution_transpose(
auto output_sizes = conv_input_size(input.sizes(), weight_IOHW_sizes, padding_expanded, output_padding_expanded, stride_expanded, dilation_expanded, groups);
auto output = at::empty({0}, input.options());
const ideep::tensor x = itensor_from_tensor(input);
const ideep::tensor x = itensor_from_tensor(input, /*from_const_data_ptr*/true);
ideep::tensor w = itensor_from_tensor(weight);
ideep::tensor w = itensor_from_tensor(weight, /*from_const_data_ptr*/true);
if (!weight.is_mkldnn()) {
// mkldnn transposed convolution has weight in logical order of OIHW or OIDHW,
// while PyTorch has IOHW or IODHW, `._tranpose()` switches strides (no memory copy).
@ -720,7 +720,7 @@ Tensor _mkldnn_convolution_transpose(
}
if (bias.defined()) {
const ideep::tensor b = itensor_from_tensor(bias);
const ideep::tensor b = itensor_from_tensor(bias, /*from_const_data_ptr*/true);
ideep::convolution_transpose_forward::compute_v3(
x,
w,


@ -3268,6 +3268,8 @@
autogen: native_layer_norm_backward.out
tags: core
- func: rms_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor
- func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
variants: function, method
dispatch:


@ -48,6 +48,8 @@ std::tuple<Tensor, Tensor> fake_quantize_per_channel_affine_cachemask(
int64_t axis,
int64_t quant_min,
int64_t quant_max) {
TORCH_CHECK(scale.scalar_type() == ScalarType::Float,
"Scale must be Float, found ", scale.scalar_type());
TORCH_CHECK(zero_point.scalar_type() == ScalarType::Int || zero_point.scalar_type() == ScalarType::Float || zero_point.scalar_type() == ScalarType::Half,
"Zero-point must be Int32, Float or Half, found ", zero_point.scalar_type());
TORCH_CHECK(scale.dim() == 1, "scale should be a 1-D tensor");
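
The added checks reject unsupported scale/zero_point dtypes up front with a clear error rather than quantizing incorrectly. A hedged sketch of a call the new check is expected to reject, assuming at::fake_quantize_per_channel_affine routes through the cachemask variant shown above:

    #include <ATen/ATen.h>

    void fake_quant_dtype_check_demo() {
      at::Tensor x = at::randn({2, 3});
      at::Tensor scale = at::ones({2}, at::kDouble);   // must be Float per the new check
      at::Tensor zero_point = at::zeros({2}, at::kInt);
      // Expected to throw: "Scale must be Float, found Double"
      at::fake_quantize_per_channel_affine(x, scale, zero_point,
                                           /*axis=*/0, /*quant_min=*/0, /*quant_max=*/255);
    }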

Some files were not shown because too many files have changed in this diff.