Fix remaining -m<module> patterns in Python code and scripts for consistency

Co-authored-by: malfet <2453524+malfet@users.noreply.github.com>
Replace python3 -mpip and python -mpip with python3 -m pip and python -m pip for better readability
2025-10-21 13:44:15 +08:00 · 2025-10-17 14:56:15 +00:00 · 2025-10-17 14:52:55 +00:00 · 2025-10-17 14:45:52 +00:00 · 2025-10-17 14:42:14 +00:00 · 2025-10-17 13:39:36 +00:00
61 changed files with 1636 additions and 263 deletions
--- a/.ci/docker/almalinux/Dockerfile
+++ b/.ci/docker/almalinux/Dockerfile
@ -20,7 +20,7 @@ ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH

 # cmake-3.18.4 from pip
 RUN yum install -y python3-pip && \
-    python3 -mpip install cmake==3.18.4 && \
+    python3 -m pip install cmake==3.18.4 && \
    ln -s /usr/local/bin/cmake /usr/bin/cmake3
 RUN rm -rf /usr/local/cuda-*

--- a/.ci/docker/common/install_inductor_benchmark_deps.sh
+++ b/.ci/docker/common/install_inductor_benchmark_deps.sh
@ -25,7 +25,7 @@ function install_torchbench() {
  python install.py --continue_on_fail

  echo "Print all dependencies after TorchBench is installed"
-  python -mpip freeze
+  python -m pip freeze
  popd

  chown -R jenkins torchbench
--- a/.ci/docker/common/install_mkl.sh
+++ b/.ci/docker/common/install_mkl.sh
@ -8,8 +8,8 @@ MKLROOT=/opt/intel
 mkdir -p ${MKLROOT}
 pushd /tmp

-python3 -mpip install wheel
-python3 -mpip download -d . mkl-static==${MKL_VERSION}
+python3 -m pip install wheel
+python3 -m pip download -d . mkl-static==${MKL_VERSION}
 python3 -m wheel unpack mkl_static-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
 python3 -m wheel unpack mkl_include-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
 mv mkl_static-${MKL_VERSION}/mkl_static-${MKL_VERSION}.data/data/lib ${MKLROOT}
--- a/.ci/docker/common/install_python.sh
+++ b/.ci/docker/common/install_python.sh
@ -11,5 +11,5 @@ ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
 python -m venv /var/lib/jenkins/ci_env
 source /var/lib/jenkins/ci_env/bin/activate

-python -mpip install --upgrade pip
-python -mpip install -r /opt/requirements-ci.txt
+python -m pip install --upgrade pip
+python -m pip install -r /opt/requirements-ci.txt
--- a/.ci/docker/manywheel/Dockerfile_2_28
+++ b/.ci/docker/manywheel/Dockerfile_2_28
@ -14,7 +14,7 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op

 # cmake-3.18.4 from pip
 RUN yum install -y python3-pip && \
-    python3 -mpip install cmake==3.18.4 && \
+    python3 -m pip install cmake==3.18.4 && \
    ln -s /usr/local/bin/cmake /usr/bin/cmake3

 FROM base as openssl
@ -135,7 +135,7 @@ RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh

 # cmake-3.18.4 from pip; force in case cmake3 already exists
 RUN yum install -y python3-pip && \
-    python3 -mpip install cmake==3.18.4 && \
+    python3 -m pip install cmake==3.18.4 && \
    ln -sf /usr/local/bin/cmake /usr/bin/cmake3

 FROM cpu_final as cuda_final
@ -157,7 +157,7 @@ ENV ROCM_PATH /opt/rocm
 # cmake-3.28.4 from pip to get enable_language(HIP)
 # and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker
 RUN python3 -m pip install --upgrade pip && \
-    python3 -mpip install cmake==3.28.4
+    python3 -m pip install cmake==3.28.4
 # replace the libdrm in /opt/amdgpu with custom amdgpu.ids lookup path
 ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
 RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
@ -174,7 +174,7 @@ FROM cpu_final as xpu_final
 ENV XPU_DRIVER_TYPE ROLLING
 # cmake-3.28.4 from pip
 RUN python3 -m pip install --upgrade pip && \
-    python3 -mpip install cmake==3.28.4
+    python3 -m pip install cmake==3.28.4
 ADD ./common/install_xpu.sh install_xpu.sh
 ENV XPU_VERSION 2025.2
 RUN bash ./install_xpu.sh && rm install_xpu.sh
--- a/.ci/docker/manywheel/Dockerfile_s390x
+++ b/.ci/docker/manywheel/Dockerfile_s390x
@ -113,7 +113,7 @@ RUN dnf install -y \
 RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio

 # cmake-3.28.0 from pip for onnxruntime
-RUN python3 -mpip install cmake==3.28.0
+RUN python3 -m pip install cmake==3.28.0

 ADD ./common/patch_libstdc.sh patch_libstdc.sh
 RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -288,7 +288,7 @@ else
    # or building non-XLA tests.
    if [[ "$BUILD_ENVIRONMENT" != *rocm*  && "$BUILD_ENVIRONMENT" != *xla* && "$BUILD_ENVIRONMENT" != *riscv64* ]]; then
      # Install numpy-2.0.2 for builds which are backward compatible with 1.X
-      python -mpip install numpy==2.0.2
+      python -m pip install numpy==2.0.2

      WERROR=1 python setup.py clean

--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -67,13 +67,13 @@ function pip_install_whl() {
    # Loop through each path and install individually
    for path in "${paths[@]}"; do
      echo "Installing $path"
-      python3 -mpip install --no-index --no-deps "$path"
+      python3 -m pip install --no-index --no-deps "$path"
    done
  else
    # Loop through each argument and install individually
    for path in "${args[@]}"; do
      echo "Installing $path"
-      python3 -mpip install --no-index --no-deps "$path"
+      python3 -m pip install --no-index --no-deps "$path"
    done
  fi
 }
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -182,7 +182,7 @@ checkout_install_torchbench() {
  pip uninstall -y torchao

  echo "Print all dependencies after TorchBench is installed"
-  python -mpip freeze
+  python -m pip freeze
 }

 torchbench_setup_macos() {
@ -211,7 +211,7 @@ torchbench_setup_macos() {
 }

 pip_benchmark_deps() {
-  python -mpip install --no-input requests cython scikit-learn six
+  python -m pip install --no-input requests cython scikit-learn six
 }


--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -1434,7 +1434,7 @@ EOF
  # shellcheck source=./common-build.sh
  source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
  python -m build --wheel --no-isolation -C--build-option=--bdist-dir="base_bdist_tmp" --outdir "base_dist"
-  python -mpip install base_dist/*.whl
+  python -m pip install base_dist/*.whl
  echo "::endgroup::"

  pushd test/forward_backward_compatibility
--- a/.ci/wheel/build_wheel.sh
+++ b/.ci/wheel/build_wheel.sh
@ -173,7 +173,7 @@ esac
 PINNED_PACKAGES=(
    "numpy${NUMPY_PINNED_VERSION}"
 )
-python -mvenv ~/${desired_python}-build
+python -m venv ~/${desired_python}-build
 source ~/${desired_python}-build/bin/activate
 retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
 retry brew install libomp
--- a/.github/scripts/prepare_vllm_wheels.sh
+++ b/.github/scripts/prepare_vllm_wheels.sh
@ -24,7 +24,7 @@ change_wheel_version() {
  local t_version=$4

  # Extract the wheel
-  ${PYTHON_EXECUTABLE} -mwheel unpack $wheel
+  ${PYTHON_EXECUTABLE} -m wheel unpack $wheel

  mv "${package}-${f_version}" "${package}-${t_version}"
  # Change the version from f_version to t_version in the dist-info dir
@ -47,7 +47,7 @@ change_wheel_version() {
  popd

  # Repack the wheel
-  ${PYTHON_EXECUTABLE} -mwheel pack "${package}-${t_version}"
+  ${PYTHON_EXECUTABLE} -m wheel pack "${package}-${t_version}"

  # Clean up
  rm -rf "${package}-${t_version}"
@ -85,7 +85,7 @@ repackage_wheel() {
 }

 # Require to re-package the wheel
-${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1
+${PYTHON_EXECUTABLE} -m pip install wheel==0.45.1

 pushd externals/vllm/wheels
 for package in xformers flashinfer-python vllm; do
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@ -211,7 +211,7 @@ jobs:
            $tool --version
          done

-          python3 -mpip install --no-index --no-deps dist/*.whl
+          python3 -m pip install --no-index --no-deps dist/*.whl

          set +e
          pushd "${RUNNER_TEMP}"
@ -222,7 +222,7 @@ jobs:
          popd

          if [ "${RC}" -ne 0 ]; then
-            python3 -mpip install --ignore-installed -r "${PIP_REQUIREMENTS_FILE}"
+            python3 -m pip install --ignore-installed -r "${PIP_REQUIREMENTS_FILE}"
          fi
          set -e

--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@ -204,7 +204,7 @@ jobs:
        run: |
          pushd "${PYTORCH_FINAL_PACKAGE_DIR}"
          # shellcheck disable=SC2046,SC2102
-          python3 -mpip install $(echo *.whl)[opt-einsum,optree] optree==0.13.0
+          python3 -m pip install $(echo *.whl)[opt-einsum,optree] optree==0.13.0
          popd

          .ci/pytorch/win-test.sh
--- a/.github/workflows/build-vllm-wheel.yml
+++ b/.github/workflows/build-vllm-wheel.yml
@ -126,13 +126,13 @@ jobs:
            "${MANYLINUX_IMAGE}"
          )

-          docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \
+          docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install \
            --pre torch torchvision torchaudio \
            --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"

          # I wonder if there is a command to both download and install the wheels
          # in one go
-          docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip download \
+          docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip download \
            --pre torch torchvision torchaudio \
            --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"

--- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
@ -106,7 +106,7 @@ jobs:
          SMOKE_TEST_PARAMS=""

          # shellcheck disable=SC2086
-          python -mvenv test_venv
+          python -m venv test_venv
          source test_venv/bin/activate
          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

@ -216,7 +216,7 @@ jobs:
          SMOKE_TEST_PARAMS=""

          # shellcheck disable=SC2086
-          python -mvenv test_venv
+          python -m venv test_venv
          source test_venv/bin/activate
          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

@ -326,7 +326,7 @@ jobs:
          SMOKE_TEST_PARAMS=""

          # shellcheck disable=SC2086
-          python -mvenv test_venv
+          python -m venv test_venv
          source test_venv/bin/activate
          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

@ -436,7 +436,7 @@ jobs:
          SMOKE_TEST_PARAMS=""

          # shellcheck disable=SC2086
-          python -mvenv test_venv
+          python -m venv test_venv
          source test_venv/bin/activate
          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

@ -546,7 +546,7 @@ jobs:
          SMOKE_TEST_PARAMS=""

          # shellcheck disable=SC2086
-          python -mvenv test_venv
+          python -m venv test_venv
          source test_venv/bin/activate
          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

@ -656,7 +656,7 @@ jobs:
          SMOKE_TEST_PARAMS=""

          # shellcheck disable=SC2086
-          python -mvenv test_venv
+          python -m venv test_venv
          source test_venv/bin/activate
          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

@ -766,7 +766,7 @@ jobs:
          SMOKE_TEST_PARAMS=""

          # shellcheck disable=SC2086
-          python -mvenv test_venv
+          python -m venv test_venv
          source test_venv/bin/activate
          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

--- a/.github/workflows/operator_benchmark.yml
+++ b/.github/workflows/operator_benchmark.yml
@ -52,3 +52,27 @@ jobs:
      docker-image: ${{ needs.x86-opbenchmark-build.outputs.docker-image }}
      test-matrix: ${{ needs.x86-opbenchmark-build.outputs.test-matrix }}
    secrets: inherit
+
+  aarch64-opbenchmark-build:
+    if: github.repository_owner == 'pytorch'
+    name: aarch64-opbenchmark-build
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-jammy-aarch64-py3.10
+      runner: linux.arm64.m7g.4xlarge
+      docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11
+      test-matrix: |
+        { include: [
+          { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" },
+        ]}
+    secrets: inherit
+
+  aarch64-opbenchmark-test:
+    name: aarch64-opbenchmark-test
+    uses: ./.github/workflows/_linux-test.yml
+    needs: aarch64-opbenchmark-build
+    with:
+      build-environment: linux-jammy-aarch64-py3.10
+      docker-image: ${{ needs.aarch64-opbenchmark-build.outputs.docker-image }}
+      test-matrix: ${{ needs.aarch64-opbenchmark-build.outputs.test-matrix }}
+    secrets: inherit
--- a/2
+++ b/2
@ -39,7 +39,7 @@ RUN chmod +x ~/miniconda.sh && \
    bash ~/miniconda.sh -b -p /opt/conda && \
    rm ~/miniconda.sh && \
    /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
-    /opt/conda/bin/python -mpip install -r requirements.txt && \
+    /opt/conda/bin/python -m pip install -r requirements.txt && \
    /opt/conda/bin/conda clean -ya

 FROM dev-base as submodule-update
--- a/aten/src/ATen/core/PhiloxRNGEngine.h
+++ b/aten/src/ATen/core/PhiloxRNGEngine.h
@ -229,10 +229,10 @@ private:
  }


-  static const uint32_t kPhilox10A = 0x9E3779B9;
-  static const uint32_t kPhilox10B = 0xBB67AE85;
-  static const uint32_t kPhiloxSA = 0xD2511F53;
-  static const uint32_t kPhiloxSB = 0xCD9E8D57;
+  static constexpr uint32_t kPhilox10A = 0x9E3779B9;
+  static constexpr uint32_t kPhilox10B = 0xBB67AE85;
+  static constexpr uint32_t kPhiloxSA = 0xD2511F53;
+  static constexpr uint32_t kPhiloxSB = 0xCD9E8D57;
 };

 typedef philox_engine Philox4_32;
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@ -16,6 +16,8 @@
 #include <c10/util/irange.h>
 #include <c10/core/ScalarType.h>

+#include <ATen/cuda/detail/BLASConstants.h>
+
 #ifdef USE_ROCM
 #include <c10/cuda/CUDAStream.h>
 #include <hipblaslt/hipblaslt-ext.hpp>
@ -1954,13 +1956,15 @@ void scaled_gemm(
    const void *result_scale_ptr,
    int64_t result_ld,
    ScalarType result_dtype,
-    bool use_fast_accum) {
+    bool use_fast_accum,
+    const std::optional<Tensor>& alpha) {
  // Note: see `cublasCommonArgs` for various non-intuitive manupulations
  // of input arguments to this function.
  const auto computeType = CUBLAS_COMPUTE_32F;
  const auto scaleType = CUDA_R_32F;
-  const float alpha_val = 1.0;
-  const float beta_val = 0.0;
+  // Note: alpha_val may change later depending on user-passed argument
+  float alpha_val = 1.0;
+  float beta_val = 0.0;
  CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType);
  computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, _cublasOpFromChar(transa));
  computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
@ -2031,6 +2035,33 @@ void scaled_gemm(
    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, CUBLASLT_EPILOGUE_BIAS);
    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype));
  }
+
+  // Handle user-passed alpha
+  float *alpha_ptr = &alpha_val;
+  float *beta_ptr = &beta_val;
+
+  if (alpha.has_value()) {
+    auto& a = alpha.value();
+
+    // alpha is a device (CUDA) tensor: hand it to cuBLASLt through a device-side pointer
+    if (a.is_cuda()) {
+      // NOTE: there are lifetime requirements on device-side pointers for alpha/beta -- the value must be
+      //       valid & correct until the cuBLAS call finishes (not merely until it is scheduled, as with
+      //       host-side values). Thus we need allocations for alpha/beta that have some guarantees on
+      //       lifetime: a statically managed 4B buffer for alpha, into which we copy the passed alpha
+      //       value, and constant memory for beta.
+      float *user_alpha_ptr = at::cuda::detail::get_user_alpha_ptr();
+      at::Tensor user_alpha = at::from_blob(user_alpha_ptr, {1}, TensorOptions().device(kCUDA).dtype(kFloat));
+      user_alpha.copy_(a);
+      // Tell cublasLt we're using device-side pointers for alpha/beta
+      auto pointer_mode = CUBLASLT_POINTER_MODE_DEVICE;
+      computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_POINTER_MODE, pointer_mode);
+      alpha_ptr = user_alpha.data_ptr<float>();
+      beta_ptr = at::cuda::detail::get_cublas_device_zero();
+    } else {
+      alpha_val = a.item<float>();
+    }
+  }
    // For other data types, use the get_scale_mode function based on scaling type
    // The SCALE_MODE attrs only exist in cuBLAS 12.8+/ROCm 7.0 or in recent hipblaslt,
    // but we must invoke get_scale_mode anyways to trigger the version checks.
@ -2048,6 +2079,7 @@ void scaled_gemm(
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  int returnedResult = 0;
  cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle();
+
  TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
      ltHandle,
      computeDesc.descriptor(),
@ -2088,10 +2120,10 @@ void scaled_gemm(
        auto is_valid_status = hipblaslt_ext::matmulIsAlgoSupported(
                ltHandle,
                computeDesc.descriptor(),
-                &alpha_val,
+                alpha_ptr,
                Adesc.descriptor(),
                Bdesc.descriptor(),
-                &beta_val,
+                beta_ptr,
                Cdesc.descriptor(),
                Ddesc.descriptor(),
                all_algos[i].algo,
@ -2110,17 +2142,14 @@ void scaled_gemm(
  cublasStatus_t cublasStatus = cublasLtMatmul(
      ltHandle,
      computeDesc.descriptor(),
-      &alpha_val,
+      alpha_ptr,
      mat1_ptr,
      Adesc.descriptor(),
      mat2_ptr,
      Bdesc.descriptor(),
-      &beta_val,
-#ifdef USE_ROCM
+      beta_ptr,
+      // NOTE: always use result_ptr here, because cuBLASLt w/device beta=0 can't handle nullptr either
      result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr
-#else
-      nullptr,
-#endif // ifdef USE_ROCM
      Cdesc.descriptor(),
      result_ptr,
      Ddesc.descriptor(),
--- a/aten/src/ATen/cuda/CUDABlas.h
+++ b/aten/src/ATen/cuda/CUDABlas.h
@ -161,7 +161,8 @@ void scaled_gemm(
    const void* result_scale_ptr,
    int64_t result_ld,
    ScalarType result_dtype,
-    bool use_fast_accum);
+    bool use_fast_accum,
+    const std::optional<Tensor>& alpha);

 #define CUDABLAS_BGEMM_ARGTYPES(Dtype)  CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype)

--- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
+++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
@ -325,9 +325,9 @@ uint64_t CUDAGeneratorImpl::seed() {
 */
 c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
  // The RNG state comprises the seed, and an offset used for Philox.
-  static const size_t seed_size = sizeof(uint64_t);
-  static const size_t offset_size = sizeof(int64_t);
-  static const size_t total_size = seed_size + offset_size;
+  constexpr size_t seed_size = sizeof(uint64_t);
+  constexpr size_t offset_size = sizeof(int64_t);
+  constexpr size_t total_size = seed_size + offset_size;

  auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
  auto rng_state = state_tensor.data_ptr<uint8_t>();
@ -346,9 +346,9 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
 * and size of the internal state.
 */
 void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
-  static const size_t seed_size = sizeof(uint64_t);
-  static const size_t offset_size = sizeof(int64_t);
-  static const size_t total_size = seed_size + offset_size;
+  constexpr size_t seed_size = sizeof(uint64_t);
+  constexpr size_t offset_size = sizeof(int64_t);
+  constexpr size_t total_size = seed_size + offset_size;

  detail::check_rng_state(new_state);

--- a/aten/src/ATen/cuda/detail/BLASConstants.cu
+++ b/aten/src/ATen/cuda/detail/BLASConstants.cu
@ -0,0 +1,54 @@
+#include <ATen/Functions.h>
+#include <ATen/Tensor.h>
+#include <ATen/cuda/Exceptions.h>
+
+#include <mutex>
+
+namespace at {
+namespace cuda {
+namespace detail {
+
+__device__ __constant__ float cublas_one_device;
+__device__ __constant__ float cublas_zero_device;
+
+float *get_cublas_device_one() {
+  static c10::once_flag init_flag;
+
+  c10::call_once(init_flag, []() {
+    const float one = 1.f;
+    AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_one_device, &one, sizeof(float)));
+  });
+
+  float *ptr;
+  AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_one_device));
+  return ptr;
+}
+
+float *get_cublas_device_zero() {
+  static c10::once_flag init_flag;
+
+  c10::call_once(init_flag, []() {
+    const float zero = 0.f;
+    AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_zero_device, &zero, sizeof(float)));
+  });
+
+  float *ptr;
+  AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_zero_device));
+  return ptr;
+}
+
+float *get_user_alpha_ptr() {
+  static float *alpha_ptr;
+
+  static c10::once_flag init_flag;
+
+  c10::call_once(init_flag, []() {
+    AT_CUDA_CHECK(cudaMalloc(&alpha_ptr, sizeof(float)));
+  });
+
+  return alpha_ptr;
+}
+
+} // namespace detail
+} // namespace cuda
+} // namespace at
--- a/aten/src/ATen/cuda/detail/BLASConstants.h
+++ b/aten/src/ATen/cuda/detail/BLASConstants.h
@ -0,0 +1,11 @@
+#pragma once
+
+#include <ATen/core/TensorBase.h>
+
+namespace at::cuda::detail {
+
+float *get_cublas_device_one();
+float *get_cublas_device_zero();
+float *get_user_alpha_ptr();
+
+} // namespace at::cuda::detail
--- a/aten/src/ATen/cuda/tunable/TunableGemm.h
+++ b/aten/src/ATen/cuda/tunable/TunableGemm.h
@ -109,7 +109,8 @@ class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
          params->c_scale_ptr,
          params->ldc,
          params->c_dtype,
-          params->use_fast_accum);
+          params->use_fast_accum,
+          std::nullopt /* alpha */);
      return OK;
    }
 };
--- a/aten/src/ATen/native/Activation.cpp
+++ b/aten/src/ATen/native/Activation.cpp
@ -240,8 +240,8 @@ TORCH_META_FUNC(gelu_backward) (

 namespace at::native {

-static const double SELU_ALPHA = 1.6732632423543772848170429916717;
-static const double SELU_SCALE = 1.0507009873554804934193349852946;
+static constexpr double SELU_ALPHA = 1.6732632423543772848170429916717;
+static constexpr double SELU_SCALE = 1.0507009873554804934193349852946;

 DEFINE_DISPATCH(elu_stub);
 DEFINE_DISPATCH(elu_backward_stub);
--- a/aten/src/ATen/native/BlasKernel.cpp
+++ b/aten/src/ATen/native/BlasKernel.cpp
@ -286,7 +286,7 @@ template void scal_fast_path<scalar_t>(int *n, scalar_t *a, scalar_t *x, int *in
 #if AT_BUILD_WITH_BLAS()
 template <>
 bool scal_use_fast_path<double>(int64_t n, int64_t incx) {
-  auto intmax = std::numeric_limits<int>::max();
+  auto constexpr intmax = std::numeric_limits<int>::max();
  return n <= intmax && incx <= intmax;
 }

@ -315,7 +315,7 @@ bool gemv_use_fast_path<float>(
    int64_t incx,
    [[maybe_unused]] float beta,
    int64_t incy) {
-  auto intmax = std::numeric_limits<int>::max();
+  auto constexpr intmax = std::numeric_limits<int>::max();
  return (m <= intmax) && (n <= intmax) && (lda <= intmax) &&
         (incx > 0) && (incx <= intmax) && (incy > 0) && (incy <= intmax);
 }
--- a/aten/src/ATen/native/Distributions.h
+++ b/aten/src/ATen/native/Distributions.h
@ -1,5 +1,6 @@
 #pragma once

+#include <array>
 #include <ATen/native/Math.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/MathConstants.h>
@ -127,7 +128,7 @@ C10_DEVICE scalar_t sample_gamma(scalar_t alpha, BaseSampler<accscalar_t, unifor

 template<typename scalar_t>
 C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) {
-  const static scalar_t kTailValues[] = {
+  constexpr static scalar_t kTailValues[] = {
    0.0810614667953272,
    0.0413406959554092,
    0.0276779256849983,
@ -139,7 +140,7 @@ C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) {
    0.00925546218271273,
    0.00833056343336287
  };
-  if (k <= 9) {
+  if (k < std::size(kTailValues)) {
    return kTailValues[static_cast<size_t>(k)];
  }
  scalar_t kp1sq = (k + 1) * (k + 1);
--- a/aten/src/ATen/native/Math.h
+++ b/aten/src/ATen/native/Math.h
@ -581,7 +581,7 @@ scalar_t ratevl(scalar_t x, const scalar_t num[], int64_t M,
 template <typename scalar_t>
 static scalar_t lanczos_sum_expg_scaled(scalar_t x) {
  // lanczos approximation
-  static const scalar_t lanczos_sum_expg_scaled_num[13] = {
+  static constexpr scalar_t lanczos_sum_expg_scaled_num[13] = {
    0.006061842346248906525783753964555936883222,
    0.5098416655656676188125178644804694509993,
    19.51992788247617482847860966235652136208,
@ -596,7 +596,7 @@ static scalar_t lanczos_sum_expg_scaled(scalar_t x) {
    103794043.1163445451906271053616070238554,
    56906521.91347156388090791033559122686859
  };
-  static const scalar_t lanczos_sum_expg_scaled_denom[13] = {
+  static constexpr scalar_t lanczos_sum_expg_scaled_denom[13] = {
    1.,
    66.,
    1925.,
@ -712,7 +712,7 @@ static scalar_t _igamc_helper_series(scalar_t a, scalar_t x) {
 template <typename scalar_t>
 static scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) {
  // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1]
-  static const scalar_t d[25][25] =
+  static constexpr scalar_t d[25][25] =
    {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2,
      1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4,
      3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6,
--- a/aten/src/ATen/native/Normalization.cpp
+++ b/aten/src/ATen/native/Normalization.cpp
@ -62,7 +62,7 @@
 #include <utility>
 #include <vector>

-static const int MIOPEN_DIM_MAX = 5;
+static constexpr int MIOPEN_DIM_MAX = 5;

 namespace at::meta {

--- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp
+++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
@ -1038,7 +1038,7 @@ struct HelperInterpNearest : public HelperInterpBase {
  // We keep this structure for BC and consider as deprecated.
  // See HelperInterpNearestExact as replacement

-  static const int interp_size = 1;
+  static constexpr int interp_size = 1;

  static inline void init_indices_weights(
    at::ScalarType output_type,
@ -1155,7 +1155,7 @@ struct HelperInterpNearestExact : public HelperInterpNearest {

 struct HelperInterpLinear : public HelperInterpBase {

-  static const int interp_size = 2;
+  static constexpr int interp_size = 2;

  // Compute indices and weights for each interpolated dimension
  // indices_weights = {
@ -1275,7 +1275,7 @@ struct HelperInterpLinear : public HelperInterpBase {

 struct HelperInterpCubic : public HelperInterpBase {

-  static const int interp_size = 4;
+  static constexpr int interp_size = 4;

  // Compute indices and weights for each interpolated dimension
  // indices_weights = {
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -1359,7 +1359,8 @@ _scaled_gemm(
          const ScalingType scaling_choice_a, const ScalingType scaling_choice_b,
          const std::optional<Tensor>& bias,
          const bool use_fast_accum,
-          Tensor& out) {
+          Tensor& out,
+          const std::optional<Tensor>& alpha = std::nullopt) {
  cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, std::nullopt, scaling_choice_a, scaling_choice_b);
  const auto out_dtype_ = args.result->scalar_type();
  TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
@ -1410,7 +1411,8 @@ _scaled_gemm(
          args.scale_result_ptr,
          args.result_ld,
          out_dtype_,
-          use_fast_accum);
+          use_fast_accum,
+          alpha);
      return out;
  }
 }
--- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu
+++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu
@ -249,7 +249,7 @@ __global__ void max_pool_forward_nhwc(
 }


-static const int BLOCK_THREADS = 256;
+static constexpr int BLOCK_THREADS = 256;

 template <typename scalar_t, typename accscalar_t>
 #if defined (USE_ROCM)
--- a/aten/src/ATen/native/cuda/Embedding.cu
+++ b/aten/src/ATen/native/cuda/Embedding.cu
@ -36,9 +36,9 @@ namespace at::native {
 namespace {

 #if defined(USE_ROCM)
-static const int BLOCKDIMY = 16;
+static constexpr int BLOCKDIMY = 16;
 #else
-static const int BLOCKDIMY = 32;
+static constexpr int BLOCKDIMY = 32;
 #endif

 template
--- a/aten/src/ATen/native/cuda/IGammaKernel.cu
+++ b/aten/src/ATen/native/cuda/IGammaKernel.cu
@ -82,7 +82,7 @@ __host__ __device__ scalar_t lanczos_sum_expg_scaled(scalar_t x) {
  // lanczos approximation
  using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;

-  static const accscalar_t lanczos_sum_expg_scaled_num[13] = {
+  constexpr accscalar_t lanczos_sum_expg_scaled_num[13] = {
    0.006061842346248906525783753964555936883222,
    0.5098416655656676188125178644804694509993,
    19.51992788247617482847860966235652136208,
@ -97,7 +97,7 @@ __host__ __device__ scalar_t lanczos_sum_expg_scaled(scalar_t x) {
    103794043.1163445451906271053616070238554,
    56906521.91347156388090791033559122686859
  };
-  static const accscalar_t lanczos_sum_expg_scaled_denom[13] = {
+  constexpr accscalar_t lanczos_sum_expg_scaled_denom[13] = {
    1.,
    66.,
    1925.,
@ -126,10 +126,10 @@ __host__ __device__ scalar_t _igam_helper_fac(scalar_t a, scalar_t x) {

  using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
  accscalar_t ax, fac, res, num, numfac;
-  static const accscalar_t MAXLOG = std::is_same_v<accscalar_t,double> ?
+  constexpr accscalar_t MAXLOG = std::is_same_v<accscalar_t,double> ?
    7.09782712893383996843E2 : 88.72283905206835;
-  static const accscalar_t EXP1 = 2.718281828459045;
-  static const accscalar_t lanczos_g = 6.024680040776729583740234375;
+  constexpr accscalar_t EXP1 = 2.718281828459045;
+  constexpr accscalar_t lanczos_g = 6.024680040776729583740234375;

  if (::fabs(a - x) > 0.4 * ::fabs(a)) {
    ax = a * ::log(x) - x - ::lgamma(a);
@ -158,9 +158,9 @@ __host__ __device__ scalar_t _igam_helper_series(scalar_t a, scalar_t x) {
  // Compute igam using DLMF 8.11.4. [igam1]

  using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
-  static const accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
+  constexpr accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
    1.11022302462515654042E-16 : 5.9604644775390625E-8;
-  static const int MAXITER = 2000;
+  constexpr int MAXITER = 2000;

  int i;
  accscalar_t ans, ax, c, r;
@ -196,8 +196,8 @@ __host__ __device__ scalar_t _igamc_helper_series(scalar_t a, scalar_t x) {
  accscalar_t fac = 1;
  accscalar_t sum = 0;
  accscalar_t term, logx;
-  static const int MAXITER = 2000;
-  static const accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
+  constexpr int MAXITER = 2000;
+  constexpr accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
    1.11022302462515654042E-16 : 5.9604644775390625E-8;

  for (n = 1; n < MAXITER; n++) {
@ -219,7 +219,7 @@ __host__ __device__ scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t
  // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1]

  using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
-  static const accscalar_t d[25][25] =
+  constexpr accscalar_t d[25][25] =
    {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2, 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, 8.296711340953086e-7, -1.7665952736826079e-7, 6.7078535434014986e-9, 1.0261809784240308e-8, -4.3820360184533532e-9, 9.1476995822367902e-10, -2.551419399494625e-11, -5.8307721325504251e-11, 2.4361948020667416e-11, -5.0276692801141756e-12, 1.1004392031956135e-13, 3.3717632624009854e-13, -1.3923887224181621e-13, 2.8534893807047443e-14, -5.1391118342425726e-16, -1.9752288294349443e-15},
    {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3, -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7, -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6, 4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8, 1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9, 4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14, 7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13, -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14, -4.13125571381061e-15},
    {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4, 2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5, -1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6, -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10, -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9, 9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11, 1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12, 4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17, 8.8592218725911273e-15},
@ -248,7 +248,7 @@ __host__ __device__ scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t

  int k, n, sgn;
  int maxpow = 0;
-  static const accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
+  constexpr accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
    1.11022302462515654042E-16 : 5.9604644775390625E-8;
  accscalar_t lambda = x / a;
  accscalar_t sigma = (x - a) / a;
@ -314,12 +314,12 @@ __host__ __device__ scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar
  int i;
  accscalar_t ans, ax, c, yc, r, t, y, z;
  accscalar_t pk, pkm1, pkm2, qk, qkm1, qkm2;
-  static const int MAXITER = 2000;
-  static const accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
+  constexpr int MAXITER = 2000;
+  constexpr accscalar_t MACHEP = std::is_same_v<accscalar_t, double> ?
    1.11022302462515654042E-16 : 5.9604644775390625E-8;
-  static const accscalar_t BIG = std::is_same_v<accscalar_t,double> ?
+  constexpr accscalar_t BIG = std::is_same_v<accscalar_t,double> ?
    4.503599627370496e15 : 16777216.;
-  static const accscalar_t BIGINV = std::is_same_v<accscalar_t,double> ?
+  constexpr accscalar_t BIGINV = std::is_same_v<accscalar_t,double> ?
    2.22044604925031308085e-16 : 5.9604644775390625E-8;

  ax = _igam_helper_fac(a, x);
@ -385,10 +385,10 @@ __noinline__ __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) {
  using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
  accscalar_t absxma_a;

-  static const accscalar_t SMALL = 20.0;
-  static const accscalar_t LARGE = 200.0;
-  static const accscalar_t SMALLRATIO = 0.3;
-  static const accscalar_t LARGERATIO = 4.5;
+  constexpr accscalar_t SMALL = 20.0;
+  constexpr accscalar_t LARGE = 200.0;
+  constexpr accscalar_t SMALLRATIO = 0.3;
+  constexpr accscalar_t LARGERATIO = 4.5;

  if ((x < 0) || (a < 0)) {
    // out of defined-region of the function
@ -467,10 +467,10 @@ __noinline__ __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) {

  using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
  accscalar_t absxma_a;
-  static const accscalar_t SMALL = 20.0;
-  static const accscalar_t LARGE = 200.0;
-  static const accscalar_t SMALLRATIO = 0.3;
-  static const accscalar_t LARGERATIO = 4.5;
+  constexpr accscalar_t SMALL = 20.0;
+  constexpr accscalar_t LARGE = 200.0;
+  constexpr accscalar_t SMALLRATIO = 0.3;
+  constexpr accscalar_t LARGERATIO = 4.5;

  // boundary values following SciPy
  if ((x < 0) || (a < 0)) {
--- a/aten/src/ATen/native/cuda/Math.cuh
+++ b/aten/src/ATen/native/cuda/Math.cuh
@ -231,7 +231,7 @@ const auto lcm_string = jiterator_stringify(
 const auto digamma_string = jiterator_stringify(
  template <typename T>
  T digamma(T x) {
-    static const double PI_f64 = 3.14159265358979323846;
+    static constexpr double PI_f64 = 3.14159265358979323846;

    // Short-circuits if x is +/- 0 and returns -/+ ∞ per the C++ standard
    if (x == 0) {
@ -3072,9 +3072,9 @@ template <typename scalar_t>
 static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) {
  // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma
  using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
-  static const double PI_f64 = 3.14159265358979323846;
-  const accscalar_t PSI_10 = 2.25175258906672110764;
-  const accscalar_t A[] = {
+  static constexpr double PI_f64 = 3.14159265358979323846;
+  constexpr accscalar_t PSI_10 = 2.25175258906672110764;
+  constexpr accscalar_t A[] = {
      8.33333333333333333333E-2,
      -2.10927960927960927961E-2,
      7.57575757575757575758E-3,
--- a/aten/src/ATen/native/cuda/Reduce.cuh
+++ b/aten/src/ATen/native/cuda/Reduce.cuh
@ -1097,11 +1097,7 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
  // threads with different threadIdx.x are independent and will produce results for different outputs.
  // In such case, values in each loaded vector always correspond to different outputs.
  if (fastest_moving_stride == sizeof(scalar_t)) {
-#ifdef USE_ROCM
    if (reduction_on_fastest_striding_dimension && dim0 >= 128 && iter.num_reduce_dims() == 1) {
-#else
-    if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1 && vt0 >= input_vec_size) {
-#endif
      // Case 1: "vectorize along input"
      // Note that if vt0 < ReduceConfig::vec_size, then this means the register pressure could be high, in such case,
      // we should avoid vectorization.
--- a/aten/src/ATen/native/cuda/ReduceMomentKernel.cu
+++ b/aten/src/ATen/native/cuda/ReduceMomentKernel.cu
@ -39,9 +39,14 @@ static void std_var_kernel_cuda(TensorIterator& iter, double correction, bool ta
 template <typename scalar_t, typename acc_t=scalar_t, typename out_t=scalar_t>
 void mean_kernel_impl(TensorIterator& iter) {
  //  returns acc_t for all non-complex dtypes and returns T for c10::complex<T>
+  constexpr bool is_16_bits = sizeof(scalar_t) == 2;
  using factor_t = typename c10::scalar_value_type<acc_t>::type;
  factor_t factor = static_cast<factor_t>(iter.num_output_elements()) / iter.numel();
-  gpu_reduce_kernel<scalar_t, out_t>(iter, MeanOps<scalar_t, acc_t, factor_t, out_t> {factor});
+  if constexpr (is_16_bits) {
+    gpu_reduce_kernel<scalar_t, out_t, /*vt0=*/4, /*input_vec_size=*/8>(iter, MeanOps<scalar_t, acc_t, factor_t, out_t> {factor});
+  } else {
+    gpu_reduce_kernel<scalar_t, out_t>(iter, MeanOps<scalar_t, acc_t, factor_t, out_t> {factor});
+  }
 }

 static void mean_kernel_cuda(TensorIterator& iter) {
--- a/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu
+++ b/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu
@ -13,24 +13,19 @@ namespace at::native {
 template <typename scalar_t, typename acc_t = scalar_t, typename out_t = scalar_t>
 struct sum_functor {
  void operator()(TensorIterator& iter) {
-#ifdef USE_ROCM
-    // Half and BFloat16 can be packed in groups of up to 8 elements and
-    // can use *_DWORDX4 instructions to achieve that.
-    const bool is_16_bits =
-      ( (std::is_same<at::Half, scalar_t>::value) ||
-        (std::is_same<at::BFloat16, scalar_t>::value) );
-    if (is_16_bits) {
+    const auto sum_combine = [] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t {
+      return a + b;
+    };
+    constexpr bool is_16_bits = sizeof(scalar_t) == 2;
+    if constexpr (is_16_bits) {
      gpu_reduce_kernel<scalar_t, out_t, /*vt0=*/4, /*input_vec_size=*/8>(
-        iter, func_wrapper<out_t>([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t {
-          return a + b;
-        }));
-      return;
+        iter, func_wrapper<out_t>(sum_combine)
+      );
+    } else {
+      gpu_reduce_kernel<scalar_t, out_t>(
+        iter, func_wrapper<out_t>(sum_combine)
+      );
    }
-#endif
-    gpu_reduce_kernel<scalar_t, out_t>(
-        iter, func_wrapper<out_t>([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t {
-          return a + b;
-        }));
  }
 };

--- a/aten/src/ATen/native/cuda/UpSample.cuh
+++ b/aten/src/ATen/native/cuda/UpSample.cuh
@ -277,7 +277,7 @@ struct BilinearFilterFunctor {
    return 0;
  }

-  static const int size = 2;
+  static constexpr int size = 2;
 };

 // taken from
@ -301,7 +301,7 @@ struct BicubicFilterFunctor {
    return 0;
  }

-  static const int size = 4;
+  static constexpr int size = 4;
 };

 template <typename accscalar_t>
--- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu
+++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
@ -141,7 +141,11 @@ WelfordDataLN cuWelfordOnlineSum(
  if constexpr (!rms_norm){
    U delta = val - curr_sum.mean;
    U new_count = curr_sum.count + 1.f;
+#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
+    U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count);
+#else
    U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster
+#endif
    return {new_mean, curr_sum.sigma2 + delta * (val - new_mean), new_count};
  } else{
    return {0.f, curr_sum.sigma2 + val * val, 0};
@ -159,7 +163,11 @@ WelfordDataLN cuWelfordCombine(
    U count = dataA.count + dataB.count;
    U mean, sigma2;
    if (count > decltype(dataB.count){0}) {
+#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
+      auto coef = __builtin_amdgcn_rcpf(count);
+#else
      auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division
+#endif
      auto nA = dataA.count * coef;
      auto nB = dataB.count * coef;
      mean = nA*dataA.mean + nB*dataB.mean;
--- a/aten/src/ATen/native/mkldnn/Matmul.cpp
+++ b/aten/src/ATen/native/mkldnn/Matmul.cpp
@ -416,7 +416,7 @@ static inline bool checksize(const Tensor& mat1, const Tensor& mat2){
  // else if dim = 3, mat1's size = (b * m * n), mat2's size = (b * n * k)
  // else called from aten::mv, mat1.size = (m * n), mat2.size = (n)
  // only m * n * b * k(if exist) are large enough we can get benefit from mkldnn optimized gemm kernel
-  static const int64_t mkldnn_gemm_min_size = 16 * 16 * 16;
+  constexpr int64_t mkldnn_gemm_min_size = 16 * 16 * 16;
  if (mat1.dim() == 1 && mat2.dim() == 1) {
    // aten::dot
    return mat1.size(0) > mkldnn_gemm_min_size;
--- a/aten/src/ATen/native/mps/kernels/Shape.metal
+++ b/aten/src/ATen/native/mps/kernels/Shape.metal
@ -16,7 +16,6 @@ kernel void cat(
  auto ndim = shared_params.ndim;
  auto cat_dim = shared_params.cat_dim;
  constant auto& output_strides = shared_params.output_strides;
-  constant auto& output_sizes = shared_params.output_sizes;

  auto cat_dim_offset = input_params.cat_dim_offset;
  auto input_element_offset = input_params.input_element_offset;
--- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
+++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
@ -3551,7 +3551,7 @@ void dequantize_tensor_per_tensor_affine_cpu(

 #if defined(__ARM_NEON__) || defined(__aarch64__)

-const static int PARALLEL_THRESHOLD = 1 << 20;
+constexpr static int PARALLEL_THRESHOLD = 1 << 20;

 // Generic template defaults to naive quantize implementation
 template <typename T>
--- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
@ -1388,7 +1388,7 @@ namespace at::native {
    TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() <= 1,
        "onednn int8 linear: act scale/zp size should be 1/<=1");
    static std::optional<at::Tensor> other = std::nullopt;
-    static const std::string_view binary_post_op = "none";
+    constexpr std::string_view binary_post_op = "none";
    int64_t act_zp = act_zero_point.numel() == 1 ? act_zero_point.item().toLong() : 0;
    return linear_int8_with_onednn_weight(
        act, act_scale.item().toDouble(), act_zp,
--- a/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp
@ -16,8 +16,8 @@ namespace {

 #ifdef USE_PYTORCH_QNNPACK

-const static float qnnpack_softmax_output_scale = 0x1.0p-8f;
-const static int qnnpack_softmax_output_zero_point = 0;
+constexpr static float qnnpack_softmax_output_scale = 0x1.0p-8f;
+constexpr static int qnnpack_softmax_output_zero_point = 0;

 bool is_qnnpack_compatible(
    const Tensor& qx,
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h
+++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h
@ -110,9 +110,9 @@ class ApplyLogSumExp {
  using ElementCompute = ElementCompute_;
  using ElementLSE = ElementLSE_;

-  static int const kElementsPerAccess = ElementsPerAccess;
-  static int const kCount = kElementsPerAccess;
-  static const ScaleType::Kind kScale =
+  static int constexpr kElementsPerAccess = ElementsPerAccess;
+  static int constexpr kCount = kElementsPerAccess;
+  static constexpr ScaleType::Kind kScale =
      cutlass::epilogue::thread::ScaleType::NoBetaScaling;

  using FragmentOutput = Array<ElementOutput, kCount>;
--- a/aten/src/ATen/test/pow_test.cpp
+++ b/aten/src/ATen/test/pow_test.cpp
@ -14,16 +14,16 @@ using namespace at;

 namespace {

-const auto int_min = std::numeric_limits<int>::min();
-const auto int_max = std::numeric_limits<int>::max();
-const auto long_min = std::numeric_limits<int64_t>::min();
-const auto long_max = std::numeric_limits<int64_t>::max();
-const auto float_lowest = std::numeric_limits<float>::lowest();
-const auto float_min = std::numeric_limits<float>::min();
-const auto float_max = std::numeric_limits<float>::max();
-const auto double_lowest = std::numeric_limits<double>::lowest();
-const auto double_min = std::numeric_limits<double>::min();
-const auto double_max = std::numeric_limits<double>::max();
+constexpr auto int_min = std::numeric_limits<int>::min();
+constexpr auto int_max = std::numeric_limits<int>::max();
+constexpr auto long_min = std::numeric_limits<int64_t>::min();
+constexpr auto long_max = std::numeric_limits<int64_t>::max();
+constexpr auto float_lowest = std::numeric_limits<float>::lowest();
+constexpr auto float_min = std::numeric_limits<float>::min();
+constexpr auto float_max = std::numeric_limits<float>::max();
+constexpr auto double_lowest = std::numeric_limits<double>::lowest();
+constexpr auto double_min = std::numeric_limits<double>::min();
+constexpr auto double_max = std::numeric_limits<double>::max();

 const std::vector<int> ints {
  int_min,
--- a/aten/src/ATen/xpu/XPUGeneratorImpl.cpp
+++ b/aten/src/ATen/xpu/XPUGeneratorImpl.cpp
@ -146,9 +146,9 @@ uint64_t XPUGeneratorImpl::seed() {

 c10::intrusive_ptr<c10::TensorImpl> XPUGeneratorImpl::get_state() const {
  // The RNG state comprises the seed, and an offset used for Philox.
-  static const size_t seed_size = sizeof(uint64_t);
-  static const size_t offset_size = sizeof(uint64_t);
-  static const size_t total_size = seed_size + offset_size;
+  constexpr size_t seed_size = sizeof(uint64_t);
+  constexpr size_t offset_size = sizeof(uint64_t);
+  constexpr size_t total_size = seed_size + offset_size;

  // The internal state is returned as a CPU byte tensor.
  auto state_tensor = at::detail::empty_cpu(
@ -170,9 +170,9 @@ c10::intrusive_ptr<c10::TensorImpl> XPUGeneratorImpl::get_state() const {
 void XPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
  at::xpu::assertNotCapturing(
      "Please ensure to utilize the XPUGeneratorImpl::set_state_index method during capturing.");
-  static const size_t seed_size = sizeof(uint64_t);
-  static const size_t offset_size = sizeof(uint64_t);
-  static const size_t total_size = seed_size + offset_size;
+  constexpr size_t seed_size = sizeof(uint64_t);
+  constexpr size_t offset_size = sizeof(uint64_t);
+  constexpr size_t total_size = seed_size + offset_size;

  at::detail::check_rng_state(new_state);

--- a/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv
+++ b/benchmarks/operator_benchmark/aarch64_expected_ci_operator_benchmark_eager_float32_cpu.csv
--- a/benchmarks/operator_benchmark/pt/conv_test.py
+++ b/benchmarks/operator_benchmark/pt/conv_test.py
@ -38,12 +38,16 @@ class ConvTranspose1dBenchmark(op_bench.TorchBenchmarkBase):
 op_bench.generate_pt_test(
    configs.conv_1d_configs_short + configs.conv_1d_configs_long, Conv1dBenchmark
 )
-op_bench.generate_pt_test(
-    configs.convtranspose_1d_configs_short
-    + configs.conv_1d_configs_short
-    + configs.conv_1d_configs_long,
-    ConvTranspose1dBenchmark,
-)
+
+
+if not torch.backends.mkldnn.is_acl_available():
+    # ConvTranspose1d crashes with ACL, see https://github.com/pytorch/pytorch/issues/165654
+    op_bench.generate_pt_test(
+        configs.convtranspose_1d_configs_short
+        + configs.conv_1d_configs_short
+        + configs.conv_1d_configs_long,
+        ConvTranspose1dBenchmark,
+    )


 """
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@ -1044,6 +1044,17 @@ if(USE_ROCM)
       list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling)
    endif(CMAKE_BUILD_TYPE MATCHES Debug)

+    # Read the USE_LAYERNORM_FAST_RECIPROCAL environment variable; default to ON when it is unset.
+    if(DEFINED ENV{USE_LAYERNORM_FAST_RECIPROCAL})
+      set(USE_LAYERNORM_FAST_RECIPROCAL $ENV{USE_LAYERNORM_FAST_RECIPROCAL})
+    else()
+      set(USE_LAYERNORM_FAST_RECIPROCAL ON)
+    endif()
+
+    if(USE_LAYERNORM_FAST_RECIPROCAL)
+      add_definitions(-DUSE_LAYERNORM_FAST_RECIPROCAL)
+    endif()
+
    # needed for compat with newer versions of hip-clang that introduced C++20 mangling rules
    list(APPEND HIP_HIPCC_FLAGS -fclang-abi-compat=17)

--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@ -128,11 +128,12 @@ function(caffe2_print_configuration_summary)
  endif()
  message(STATUS "  USE_ROCM              : ${USE_ROCM}")
  if(${USE_ROCM})
-    message(STATUS "    ROCM_VERSION          : ${ROCM_VERSION}")
-    message(STATUS "    USE_FLASH_ATTENTION   : ${USE_FLASH_ATTENTION}")
-    message(STATUS "    USE_MEM_EFF_ATTENTION : ${USE_MEM_EFF_ATTENTION}")
-    message(STATUS "    USE_ROCM_CK_SDPA      : ${USE_ROCM_CK_SDPA}")
-    message(STATUS "    USE_ROCM_CK_GEMM      : ${USE_ROCM_CK_GEMM}")
+    message(STATUS "    ROCM_VERSION                  : ${ROCM_VERSION}")
+    message(STATUS "    USE_FLASH_ATTENTION           : ${USE_FLASH_ATTENTION}")
+    message(STATUS "    USE_MEM_EFF_ATTENTION         : ${USE_MEM_EFF_ATTENTION}")
+    message(STATUS "    USE_ROCM_CK_SDPA              : ${USE_ROCM_CK_SDPA}")
+    message(STATUS "    USE_ROCM_CK_GEMM              : ${USE_ROCM_CK_GEMM}")
+    message(STATUS "    USE_LAYERNORM_FAST_RECIPROCAL : ${USE_LAYERNORM_FAST_RECIPROCAL}")
  endif()
  message(STATUS "  BUILD_NVFUSER         : ${BUILD_NVFUSER}")
  message(STATUS "  USE_EIGEN_FOR_BLAS    : ${CAFFE2_USE_EIGEN_FOR_BLAS}")
--- a/setup.py
+++ b/setup.py
@ -156,6 +156,10 @@
 #   USE_ROCM_KERNEL_ASSERT=1
 #     Enable kernel assert in ROCm platform
 #
+#   USE_LAYERNORM_FAST_RECIPROCAL
+#     ROCm only: use the builtin fast reciprocal (1/x) in the layer normalization
+#     kernels. Enabled by default; set to 0 or OFF to disable.
+#
 #   USE_ROCM_CK_GEMM=1
 #     Enable building CK GEMM backend in ROCm platform
 #
--- a/test/export/test_export.py
+++ b/test/export/test_export.py
@ -721,20 +721,6 @@ class TestExport(TestCase):
                )
                self.assertEqual(node.meta["from_node"][-1].graph_id, graph_id)

-    def test_fx_annotate(self):
-        class Foo(torch.nn.Module):
-            def forward(self, x):
-                x += 1
-                with torch.fx.traceback.annotate({"a": "b"}):
-                    x += 1
-                x += 1
-                return x
-
-        ep = export(Foo(), (torch.randn(2),))
-
-        add_1 = list(ep.graph.nodes)[2]
-        self.assertTrue("custom" in add_1.meta and add_1.meta["custom"].get("a") == "b")
-
    @requires_gpu
    def test_flex_attention_export(self):
        from torch.nn.attention.flex_attention import create_block_mask, flex_attention
--- a/test/functorch/test_aot_joint_with_descriptors.py
+++ b/test/functorch/test_aot_joint_with_descriptors.py
@ -922,46 +922,6 @@ class inner_f(torch.nn.Module):
            in custom_metadata
        )

-    def test_preserve_annotate_function(self):
-        """Test basic annotate_fn usage"""
-
-        @fx_traceback.annotate_fn({"pp_stage": 1})
-        def example_function(x):
-            return x * x
-
-        class SimpleLinear(nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.linear = nn.Linear(3, 2)
-
-            def forward(self, x):
-                with fx_traceback.annotate({"pp_stage": 0}):
-                    y = self.linear(x)
-                y = example_function(y)
-                return y - 1
-
-        inputs = (torch.randn(4, 3),)
-        model = SimpleLinear()
-
-        for with_export in [True, False]:
-            graph_module = graph_capture(model, inputs, with_export)
-            custom_metadata = fx_traceback._get_custom_metadata(graph_module)
-            self.assertExpectedInline(
-                str(custom_metadata),
-                """\
-('call_function', 't', {'pp_stage': 0})
-('call_function', 'addmm', {'pp_stage': 0})
-('call_function', 'mul', {'pp_stage': 1})
-('call_function', 'mul_1', {'pp_stage': 1})
-('call_function', 'mul_2', {'pp_stage': 1})
-('call_function', 't_1', {'pp_stage': 0})
-('call_function', 'mm', {'pp_stage': 0})
-('call_function', 't_2', {'pp_stage': 0})
-('call_function', 'sum_1', {'pp_stage': 0})
-('call_function', 'view', {'pp_stage': 0})
-('call_function', 't_3', {'pp_stage': 0})""",
-            )
-

 if __name__ == "__main__":
    run_tests()
--- a/tools/linter/adapters/pip_init.py
+++ b/tools/linter/adapters/pip_init.py
@ -76,7 +76,7 @@ def main() -> None:
    if uv and (is_uv_managed_python or not need_user_flag):
        pip_args = [uv, "pip", "install"]
    elif sys.executable:
-        pip_args = [sys.executable, "-mpip", "install"]
+        pip_args = [sys.executable, "-m", "pip", "install"]
    else:
        pip_args = ["pip3", "install"]

--- a/torch/export/_trace.py
+++ b/torch/export/_trace.py
@ -812,10 +812,7 @@ def _export_to_torch_ir(
        prefer_deferred_runtime_asserts_over_guards=prefer_deferred_runtime_asserts_over_guards,
    )

-    with (
-        torch._dynamo.config.patch(dataclasses.asdict(dynamo_cfg)),
-        torch.fx.traceback.preserve_node_meta(),
-    ):
+    with torch._dynamo.config.patch(dataclasses.asdict(dynamo_cfg)):
        try:
            module_call_specs: dict[str, dict[str, pytree.TreeSpec]] = (
                _ExportModuleSpecTrackerDict()
@ -905,7 +902,6 @@ def _export_to_aten_ir(
        _ignore_backend_decomps(),
        _compiling_state_context(),
        custom_triton_ops_decomposition_ctx(),
-        torch.fx.traceback.preserve_node_meta(),
    ):
        gm, graph_signature = transform(aot_export_module)(
            mod,
@ -1934,8 +1930,9 @@ def _non_strict_export(
                            in mod._forward_pre_hooks.values()
                        ):
                            _check_input_constraints_pre_hook(mod, args, kwargs)
-                        args = (*args, *kwargs.values())
-                        tree_out = torch.fx.Interpreter(mod).run(*args)
+                        with torch.fx.traceback.preserve_node_meta():
+                            args = (*args, *kwargs.values())
+                            tree_out = torch.fx.Interpreter(mod).run(*args)
                    else:
                        tree_out = mod(*args, **kwargs)
                    flat_outs, out_spec = pytree.tree_flatten(tree_out)
@ -2032,7 +2029,6 @@ def _non_strict_export(
            ),
            _fakify_module_inputs(fake_args, fake_kwargs, fake_mode),
            _override_builtin_ops(),
-            torch.fx.traceback.preserve_node_meta(),
        ):
            aten_export_artifact = _to_aten_func(  # type: ignore[operator]
                patched_mod,
--- a/torch/fx/traceback.py
+++ b/torch/fx/traceback.py
@ -18,7 +18,6 @@ log = logging.getLogger(__name__)

 __all__ = [
    "annotate",
-    "annotate_fn",
    "preserve_node_meta",
    "has_preserved_node_meta",
    "set_stack_trace",
@ -292,42 +291,6 @@ def annotate(annotation_dict: dict):
            del current_meta["custom"]


-@compatibility(is_backward_compatible=False)
-def annotate_fn(annotation_dict: dict):
-    """
-    A decorator that wraps a function with the annotate context manager.
-    Use this when you want to annotate an entire function instead of a specific code block.
-
-    Note:
-        This API is **not backward compatible** and may evolve in future releases.
-
-    Note:
-        This API is not compatible with fx.symbolic_trace or jit.trace. It's intended
-        to be used with PT2 family of tracers, e.g. torch.export and dynamo.
-
-    Args:
-        annotation_dict (dict): A dictionary of custom key-value pairs to inject
-            into the FX trace metadata for all operations in the function.
-
-    Example:
-        >>> @annotate_fn({"pp_stage": 1})
-        ... def my_function(x):
-        ...     return x + 1
-        # All operations in my_function will have {"pp_stage": 1} in their metadata.
-    """
-    from functools import wraps
-
-    def decorator(func):
-        @wraps(func)
-        def wrapper(*args, **kwargs):
-            with annotate(annotation_dict):
-                return func(*args, **kwargs)
-
-        return wrapper
-
-    return decorator
-
-
@compatibility(is_backward_compatible=False)
 def set_grad_fn_seq_nr(seq_nr):
    global current_meta
--- a/torch/utils/collect_env.py
+++ b/torch/utils/collect_env.py
@ -640,9 +640,9 @@ def get_pip_packages(run_lambda, patterns=None):

    os.environ["PIP_DISABLE_PIP_VERSION_CHECK"] = "1"
    # People generally have pip as `pip` or `pip3`
-    # But here it is invoked as `python -mpip`
+    # But here it is invoked as `python -m pip`
    out = run_and_read_all(
-        run_lambda, [sys.executable, "-mpip", "list", "--format=freeze"]
+        run_lambda, [sys.executable, "-m", "pip", "list", "--format=freeze"]
    )
    if out is None:
        return pip_version, out
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@ -7702,8 +7702,11 @@ CUDA_IDENTIFIER_MAP = collections.OrderedDict(
        ("CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", ("HIPBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", CONV_MATH_FUNC, API_BLAS)),
        ("CUBLASLT_MATMUL_DESC_A_SCALE_MODE", ("HIPBLASLT_MATMUL_DESC_A_SCALE_MODE", CONV_MATH_FUNC, API_BLAS)),
        ("CUBLASLT_MATMUL_DESC_B_SCALE_MODE", ("HIPBLASLT_MATMUL_DESC_B_SCALE_MODE", CONV_MATH_FUNC, API_BLAS)),
+        ("CUBLASLT_MATMUL_DESC_POINTER_MODE", ("HIPBLASLT_MATMUL_DESC_POINTER_MODE", CONV_MATH_FUNC, API_BLAS)),
        ("CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0", ("HIPBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0", CONV_MATH_FUNC, API_BLAS)),
        ("CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3", ("HIPBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3", CONV_MATH_FUNC, API_BLAS)),
+        ("CUBLASLT_POINTER_MODE_DEVICE", ("HIPBLASLT_POINTER_MODE_DEVICE", CONV_NUMERIC_LITERAL, API_BLAS)),
+        ("CUBLASLT_POINTER_MODE_HOST", ("HIPBLASLT_POINTER_MODE_HOST", CONV_NUMERIC_LITERAL, API_BLAS)),
        ("cublasLtMatrixLayout_t", ("hipblasLtMatrixLayout_t", CONV_MATH_FUNC, API_BLAS)),
        ("cublasLtMatrixLayoutOpaque_t", ("hipblasLtMatrixLayoutOpaque_t", CONV_MATH_FUNC, API_BLAS)),
        ("cublasLtMatrixLayoutAttribute_t", ("hipblasLtMatrixLayoutAttribute_t", CONV_MATH_FUNC, API_BLAS)),