Revert changes

More fixes
Fix clang-tidy warnings of performance
2025-10-24 07:27:32 +08:00 · 2025-09-21 09:20:22 +00:00 · 2025-09-21 09:20:22 +00:00 · 2025-09-21 09:20:22 +00:00 · 2025-09-21 03:01:04 +00:00 · 2025-09-21 01:45:46 +00:00
567 changed files with 17260 additions and 6898 deletions
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@ -31,8 +31,7 @@ pip install -r /pytorch/requirements.txt
 pip install auditwheel==6.2.0 wheel
 if [ "$DESIRED_CUDA" = "cpu" ]; then
    echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
-    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
-    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
+    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
 else
    echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
    export USE_SYSTEM_NCCL=1
@ -46,6 +45,5 @@ else
        export USE_NVIDIA_PYPI_LIBS=1
    fi

-    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
-    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
+    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
 fi
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -317,7 +317,7 @@ if __name__ == "__main__":
    ).decode()

    print("Building PyTorch wheel")
-    build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
+    build_vars = ""
    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
    if enable_cuda:
        build_vars += "MAX_JOBS=5 "
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-56392aa978594cc155fa8af48cd949f5b5f1823a
+e0dda9059d082537cee36be6c5e4fe3b18c880c0
--- a/.ci/docker/ci_commit_pins/huggingface-requirements.txt
+++ b/.ci/docker/ci_commit_pins/huggingface-requirements.txt
@ -1,2 +1,2 @@
-transformers==4.54.0
+transformers==4.56.0
 soxr==0.5.0
--- a/.ci/docker/common/install_executorch.sh
+++ b/.ci/docker/common/install_executorch.sh
@ -42,22 +42,27 @@ install_pip_dependencies() {
  # A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current
  # numba and scipy version used in PyTorch CI
  conda_run pip uninstall -y numba scipy
+  # Yaspin is needed for running CI test (get_benchmark_analysis_data.py)
+  pip_install yaspin==3.1.0

  popd
 }

 setup_executorch() {
-  pushd executorch
-
  export PYTHON_EXECUTABLE=python
-  export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
+  export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON"

  as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
-  popd
 }

-clone_executorch
-install_buck2
-install_conda_dependencies
-install_pip_dependencies
-setup_executorch
+if [ $# -eq 0 ]; then
+  clone_executorch
+  install_buck2
+  install_conda_dependencies
+  install_pip_dependencies
+  pushd executorch
+  setup_executorch
+  popd
+else
+  "$@"
+fi
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -93,8 +93,9 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
 #Pinned versions:
 #test that import:

-mypy==1.16.0
+mypy==1.16.0 ; platform_system != "Windows"
 # Pin MyPy version because new errors are likely to appear with each release
+# Skip on Windows as lots of type annotations are POSIX specific
 #Description: linter
 #Pinned versions: 1.16.0
 #test that import: test_typing.py, test_type_hints.py
--- a/.ci/lumen_cli/cli/lib/core/vllm/lib.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py
@ -41,7 +41,6 @@ def sample_vllm_test_library():
                "pytest -v -s basic_correctness/test_cumem.py",
                "pytest -v -s basic_correctness/test_basic_correctness.py",
                "pytest -v -s basic_correctness/test_cpu_offload.py",
-                "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py",
            ],
        },
        "vllm_basic_models_test": {
@ -68,14 +67,11 @@ def sample_vllm_test_library():
                        "-v",
                        "-s",
                        "entrypoints/llm",
-                        "--ignore=entrypoints/llm/test_lazy_outlines.py",
                        "--ignore=entrypoints/llm/test_generate.py",
-                        "--ignore=entrypoints/llm/test_generate_multiple_loras.py",
                        "--ignore=entrypoints/llm/test_collective_rpc.py",
                    ]
                ),
-                "pytest -v -s entrypoints/llm/test_lazy_outlines.py",
-                "pytest -v -s entrypoints/llm/test_generate.py ",
+                "pytest -v -s entrypoints/llm/test_generate.py",
                "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
            ],
        },
--- a/.ci/pytorch/functorch_doc_push_script.sh
+++ b/.ci/pytorch/functorch_doc_push_script.sh
@ -1,40 +0,0 @@
-#!/bin/bash
-
-# This is where the local pytorch install in the docker image is located
-pt_checkout="/var/lib/jenkins/workspace"
-source "$pt_checkout/.ci/pytorch/common_utils.sh"
-echo "functorch_doc_push_script.sh: Invoked with $*"
-
-set -ex -o pipefail
-
-version=${DOCS_VERSION:-nightly}
-echo "version: $version"
-
-# Build functorch docs
-pushd $pt_checkout/functorch/docs
-make html
-popd
-
-git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages
-pushd functorch_ghpages
-
-if [ "$version" == "main" ]; then
-  version=nightly
-fi
-
-git rm -rf "$version" || true
-mv "$pt_checkout/functorch/docs/build/html" "$version"
-
-git add "$version" || true
-git status
-git config user.email "soumith+bot@pytorch.org"
-git config user.name "pytorchbot"
-# If there aren't changes, don't make a commit; push is no-op
-git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true
-git status
-
-if [[ "${WITH_PUSH:-}" == true ]]; then
-  git push -u origin gh-pages
-fi
-
-popd
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -59,7 +59,7 @@ test_python_shard() {

  setup_test_python

-  time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS"
+  time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS"

  assert_git_not_dirty
 }
--- a/.ci/pytorch/numba-cuda-13.patch
+++ b/.ci/pytorch/numba-cuda-13.patch
@ -0,0 +1,25 @@
+From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001
+From: Michael Wang <13521008+isVoid@users.noreply.github.com>
+Date: Tue, 1 Apr 2025 17:28:05 -0700
+Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage
+ (#185)
+
+Co-authored-by: isVoid <isVoid@users.noreply.github.com>
+---
+ numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
+index 1641bf77..233e9ed7 100644
+--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
+@@ -365,6 +365,9 @@ def _find_api(self, fname):
+         else:
+             variants = ('_v2', '')
+ 
++        if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
++            return getattr(self.lib, fname)
++
+         for variant in variants:
+             try:
+                 return getattr(self.lib, f'{fname}{variant}')
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -32,6 +32,16 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v
  git config --global --add safe.directory /var/lib/jenkins/workspace
 fi

+
+# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
+NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
+if [ -n "$NUMBA_CUDA_DIR" ]; then
+  NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
+  pushd "$NUMBA_CUDA_DIR"
+  patch -p4 <"$NUMBA_PATCH"
+  popd
+fi
+
 echo "Environment variables:"
 env

@ -312,14 +322,14 @@ test_python_shard() {

  # modify LD_LIBRARY_PATH to ensure it has the conda env.
  # This set of tests has been shown to be buggy without it for the split-build
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running

  assert_git_not_dirty
 }

 test_python() {
  # shellcheck disable=SC2086
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
  assert_git_not_dirty
 }

@ -374,7 +384,6 @@ test_dynamo_wrapped_shard() {
    --exclude-distributed-tests \
    --exclude-torch-export-tests \
    --exclude-aot-dispatch-tests \
-    --exclude-quantization-tests \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose \
    --upload-artifacts-while-running
@ -1147,12 +1156,6 @@ test_distributed() {
  fi
 }

-test_quantization() {
-  echo "Testing quantization"
-
-  python test/test_quantization.py
-}
-
 test_rpc() {
  echo "Testing RPC C++ tests"
  # NB: the ending test_rpc must match the current function name for the current
@ -1547,14 +1550,10 @@ test_executorch() {
  install_torchvision
  install_torchaudio

+  INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh"
+
  pushd /executorch
-
-  export PYTHON_EXECUTABLE=python
-  export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
-
-  # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
-  # from the PR
-  bash .ci/scripts/setup-linux.sh --build-tool cmake
+  "${INSTALL_SCRIPT}" setup_executorch

  echo "Run ExecuTorch unit tests"
  pytest -v -n auto
@ -1568,10 +1567,6 @@ test_executorch() {

  popd

-  # Test torchgen generated code for Executorch.
-  echo "Testing ExecuTorch op registration"
-  "$BUILD_BIN_DIR"/test_edge_op_registration
-
  assert_git_not_dirty
 }

@ -1579,6 +1574,7 @@ test_linux_aarch64() {
  python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
        test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
        test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
+        distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
        --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose

  # Dynamo tests
@ -1653,8 +1649,6 @@ elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
  test_executorch
 elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
  test_python_legacy_jit
-elif [[ "$TEST_CONFIG" == 'quantization' ]]; then
-  test_quantization
 elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
  # TODO: run some C++ tests
  echo "no-op at the moment"
--- a/.ci/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat
@ -137,7 +137,7 @@ sccache --show-stats
 python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
 (
  if "%BUILD_ENVIRONMENT%"=="" (
-    echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash.
+    echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash.
  ) else (
    copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%"

--- a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
+++ b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" (
 ) else (
  set CONDA_PARENT_DIR=C:\Jenkins
 )
-
+set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3

 :: Be conservative here when rolling out the new AMI with conda. This will try
 :: to install conda as before if it couldn't find the conda installation. This
 :: can be removed eventually after we gain enough confidence in the AMI
-if not exist %CONDA_PARENT_DIR%\Miniconda3 (
+if not exist %CONDA_ROOT_DIR% (
  set INSTALL_FRESH_CONDA=1
 )

@ -17,10 +17,14 @@ if "%INSTALL_FRESH_CONDA%"=="1" (
  if errorlevel 1 exit /b
  if not errorlevel 0 exit /b

-  %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
+  %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR%
  if errorlevel 1 exit /b
  if not errorlevel 0 exit /b
 )

 :: Activate conda so that we can use its commands, i.e. conda, python, pip
-call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3
+call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%
+:: Switch to the py_tmp conda environment so the commands below run inside it
+call conda activate py_tmp
+
+call pip install -r .ci/docker/requirements-ci.txt
--- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
+++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
@ -14,7 +14,7 @@ if not errorlevel 0 exit /b
 :: build\torch. Rather than changing all these references, making a copy of torch folder
 :: from conda to the current workspace is easier. The workspace will be cleaned up after
 :: the job anyway
-xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
+xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\

 pushd .
 if "%VC_VERSION%" == "" (
--- a/.ci/pytorch/win-test-helpers/test_python_shard.bat
+++ b/.ci/pytorch/win-test-helpers/test_python_shard.bat
@ -25,7 +25,7 @@ echo Copying over test times file
 robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files"

 echo Run nn tests
-python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
+python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
 if ERRORLEVEL 1 goto fail

 popd
--- a/.ci/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@ -38,7 +38,14 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
 fi

 # TODO: Move both of them to Windows AMI
-python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
+python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
+
+# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments
+# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node'
+# scipy from 1.6.3 to 1.10
+# expecttest from 0.1.3 to 0.3.0
+# xdoctest from 1.0.2 to 1.3.0
+python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42"

 # Install Z3 optional dependency for Windows builds.
 python -m pip install z3-solver==4.15.1.0
@ -52,9 +59,6 @@ python -m pip install parameterized==0.8.1
 # Install pulp for testing ilps under torch\distributed\_tools
 python -m pip install pulp==2.9.0

-# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
-python -m pip install expecttest==0.3.0
-
 run_tests() {
    # Run nvidia-smi if available
    for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
--- a/.github/actions/reuse-old-whl/reuse_old_whl.py
+++ b/.github/actions/reuse-old-whl/reuse_old_whl.py
@ -264,7 +264,7 @@ def unzip_artifact_and_replace_files() -> None:
        change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py")

        for file in Path(f"artifacts/dist/{old_stem}").glob(
-            "*.dist-info/**",
+            "*.dist-info/*",
        ):
            change_content_to_new_version(file)

--- a/.github/actions/setup-win/action.yml
+++ b/.github/actions/setup-win/action.yml
@ -6,6 +6,12 @@ inputs:
  cuda-version:
    description: which cuda version to install, 'cpu' for none
    required: true
+  python-version:
+    required: false
+    type: string
+    default: "3.10"
+    description: |
+      The python version to be used. Will be 3.10 by default

 runs:
  using: composite
@ -38,18 +44,24 @@ runs:
        CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat"

        {
+          echo "CONDA=${CONDA}";
          echo "CONDA_RUN=${CONDA} run --no-capture-output";
          echo "CONDA_BUILD=${CONDA} run conda-build";
          echo "CONDA_INSTALL=${CONDA} install";
        } >> "${GITHUB_ENV}"

    - name: Setup Python3
+      env:
+          PYTHON_VERSION: ${{ inputs.python-version }}
      shell: bash
      run: |
        set +e
        set -x

-        PYTHON3=$(${CONDA_RUN} which python3)
+        # Create new py_tmp env with python-version
+        ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp
+
+        PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
        EXIT_CODE=$?

        if [[ "${EXIT_CODE}" == "0" ]]; then
@ -62,7 +74,7 @@ runs:
          # installation, which is Python 3 based. Its Python is default to Python 3. Further, there
          # is also the Miniconda installation that is Python 2 based, and both can be installed if
          # needed. In both cases, Python binary is just called python
-          PYTHON=$(${CONDA_RUN} which python)
+          PYTHON=$(${CONDA_RUN} -n py_tmp which python)
          EXIT_CODE=$?

          if [[ "${EXIT_CODE}" == "0" ]]; then
--- a/.github/ci_commit_pins/vllm.txt
+++ b/.github/ci_commit_pins/vllm.txt
@ -1 +1 @@
-973c9d01da863cac9c51e8a5c0d390fc84b84fbc
+9d1c50a5ac8726f4af0d4a4e85ad4d26a674ad26
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -130,3 +130,6 @@
 - torch/csrc/inductor/aoti_include/**
 - torchgen/aoti/**
 - torchgen/gen_aoti_c_shim.py
+
+"ciflow/vllm":
+- .github/ci_commit_pins/vllm.txt
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -19,7 +19,6 @@ ciflow_push_tags:
 - ciflow/nightly
 - ciflow/periodic
 - ciflow/periodic-rocm-mi300
- ciflow/quantization-periodic
 - ciflow/rocm
 - ciflow/rocm-mi300
 - ciflow/s390
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@ -135,7 +135,7 @@ ROCM_SMOKE_WORKFLOWS = [
        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
            OperatingSystem.LINUX,
            arches=["6.4"],
-            python_versions=["3.9"],
+            python_versions=["3.10"],
        ),
        ciflow_config=CIFlowConfig(
            labels={
--- a/.github/workflows/_binary-test-linux.yml
+++ b/.github/workflows/_binary-test-linux.yml
@ -187,8 +187,6 @@ jobs:

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
-        with:
-          driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }}
        if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}

      - name: configure aws credentials
--- a/.github/workflows/_docs.yml
+++ b/.github/workflows/_docs.yml
@ -75,10 +75,6 @@ jobs:
            runner: ${{ inputs.runner_prefix }}linux.2xlarge
            # It takes less than 30m to finish python docs unless there are issues
            timeout-minutes: 30
-          - docs_type: functorch
-            runner: ${{ inputs.runner_prefix }}linux.2xlarge
-            # It takes less than 15m to finish functorch docs unless there are issues
-            timeout-minutes: 15
    # Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
    # The current name requires updating the database last docs push query from test-infra every time the matrix is updated
    name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}
@ -211,16 +207,6 @@ jobs:
          path: cppdocs/
          s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs

-      - name: Upload functorch Docs Preview
-        uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
-        if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }}
-        with:
-          retention-days: 14
-          s3-bucket: doc-previews
-          if-no-files-found: error
-          path: functorch_ghpages/nightly/
-          s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs
-
      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always()
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@ -169,7 +169,7 @@ jobs:
        id: install-nvidia-driver
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
        with:
-          driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
+          driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }}
        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}

      - name: Setup GPU_FLAG for docker run
--- a/.github/workflows/_rocm-test.yml
+++ b/.github/workflows/_rocm-test.yml
@ -62,6 +62,11 @@ on:
        required: false
        type: number
        default: 1
+    secrets:
+      HUGGING_FACE_HUB_TOKEN:
+        required: false
+        description: |
+          HF Auth token to avoid rate limits when downloading models or datasets from hub
 env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

@ -76,10 +81,9 @@ jobs:
    strategy:
      matrix: ${{ fromJSON(inputs.test-matrix) }}
      fail-fast: false
-    timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
    runs-on: ${{ matrix.runner }}
+    timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
    steps:
-      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
@ -131,6 +135,9 @@ jobs:

      - name: Start monitoring script
        id: monitor-script
+        if: ${{ !inputs.disable-monitor }}
+        shell: bash
+        continue-on-error: true
        env:
          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
@ -138,9 +145,6 @@ jobs:
          WORKFLOW_RUN_ID: ${{github.run_id}}
          MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
          MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
-        if: ${{ !inputs.disable-monitor }}
-        shell: bash
-        continue-on-error: true
        run: |
          python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
          python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
@ -178,6 +182,12 @@ jobs:
        run: |
          echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"

+      - name: Preserve github env variables for use in docker
+        shell: bash
+        run: |
+          env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
+          env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
+
      - name: Test
        id: test
        env:
@ -193,20 +203,22 @@ jobs:
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
+          TEST_CONFIG: ${{ matrix.config }}
+          SHARD_NUMBER: ${{ matrix.shard }}
+          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
+          REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
          CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
          TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
          NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
          NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
-          TEST_CONFIG: ${{ matrix.config }}
-          SHARD_NUMBER: ${{ matrix.shard }}
-          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
-          REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
          DOCKER_IMAGE: ${{ inputs.docker-image }}
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
          TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
          DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
        run: |
          set -x
@ -236,6 +248,7 @@ jobs:
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e JOB_NAME \
+            -e BASE_SHA \
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
@ -253,10 +266,12 @@ jobs:
            -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
            -e TESTS_TO_INCLUDE \
+            -e HUGGING_FACE_HUB_TOKEN \
            -e DASHBOARD_TAG \
            --env-file="${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" \
            --ulimit stack=10485760:83886080 \
            --ulimit core=0 \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="8g" \
--- a/.github/workflows/_win-build.yml
+++ b/.github/workflows/_win-build.yml
@ -151,7 +151,7 @@ jobs:
          BUILD_WHEEL: 1
          MAX_JOBS: 8
          CUDA_VERSION: ${{ inputs.cuda-version }}
-          PYTHON_VERSION: "3.9"
+          PYTHON_VERSION: "3.10"
          SCCACHE_BUCKET: "ossci-compiler-cache"
          SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
          SCCACHE_REGION: us-east-1
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@ -184,7 +184,7 @@ jobs:
        env:
          USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
          INSTALL_WINDOWS_SDK: 1
-          PYTHON_VERSION: 3.9
+          PYTHON_VERSION: "3.10"
          CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
          TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
--- a/.github/workflows/build-vllm-wheel.yml
+++ b/.github/workflows/build-vllm-wheel.yml
@ -178,12 +178,12 @@ jobs:
      contents: read
    container:
      image: continuumio/miniconda3:4.12.0
-    environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }}
+    environment: ${{ ((github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'nightly-wheel-upload' || '' }}
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Configure AWS credentials(PyTorch account) for main
-        if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }}
+        if: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -71,8 +71,7 @@ jobs:
          pytorch-linux-jammy-py3-clang12-onnx,
          pytorch-linux-jammy-linter,
          pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
-          # Executorch pin needs update
-          # pytorch-linux-jammy-py3-clang12-executorch,
+          pytorch-linux-jammy-py3-clang12-executorch,
          pytorch-linux-jammy-py3.12-triton-cpu,
          pytorch-linux-noble-riscv64-py3.12-gcc14
        ]
--- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
@ -44,7 +44,7 @@ jobs:
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}
-  manywheel-py3_9-rocm6_4-build:
+  manywheel-py3_10-rocm6_4-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
    needs: get-label-type
@ -58,16 +58,16 @@ jobs:
      GPU_ARCH_TYPE: rocm
      DOCKER_IMAGE: manylinux2_28-builder
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_9-rocm6_4
+      build_name: manywheel-py3_10-rocm6_4
      build_environment: linux-binary-manywheel-rocm
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_9-rocm6_4-test:  # Testing
+  manywheel-py3_10-rocm6_4-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
-      - manywheel-py3_9-rocm6_4-build
+      - manywheel-py3_10-rocm6_4-build
      - get-label-type
    runs-on: linux.rocm.gpu.mi250
    timeout-minutes: 240
@ -82,14 +82,14 @@ jobs:
      SKIP_ALL_TESTS: 1
      DOCKER_IMAGE: manylinux2_28-builder
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
    steps:
      - name: Setup ROCm
        uses: ./.github/actions/setup-rocm
      - uses: actions/download-artifact@v4.1.7
        name: Download Build Artifacts
        with:
-          name: manywheel-py3_9-rocm6_4
+          name: manywheel-py3_10-rocm6_4
          path: "${{ runner.temp }}/artifacts/"
      - name: Checkout PyTorch
        uses: actions/checkout@v4
--- a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml
+++ b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml
@ -43,6 +43,11 @@ on:
        required: false
        type: boolean
        default: false
+      freezing:
+        description: Run freezing?
+        required: false
+        type: boolean
+        default: true
      benchmark_configs:
        description: The list of configs used the benchmark
        required: false
@ -102,7 +107,7 @@ jobs:
    if: github.event.schedule == '0 7 * * *'
    with:
      build-environment: linux-jammy-py3.10-gcc11-build
-      dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
+      dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
      docker-image: ${{ needs.inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
      timeout-minutes: 720
@ -116,10 +121,9 @@ jobs:
    name: inductor-test
    uses: ./.github/workflows/_linux-test.yml
    needs: inductor-build
-    if: github.event_name == 'workflow_dispatch'
    with:
      build-environment: linux-jammy-py3.10-gcc11-build
-      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
+      dashboard-tag: training-${{ inputs.training || 'false' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'true' }}-aotinductor-${{ inputs.aotinductor || 'true' }}-freezing-${{ inputs.freezing || 'true' }}
      docker-image: ${{ needs.inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
      timeout-minutes: 720
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@ -105,7 +105,7 @@ jobs:
      # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
      # to run git rev-parse HEAD~:.ci/docker when a new image is needed
      fetch-depth: 0
-      submodules: false
+      submodules: true
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      script: |
        CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -127,6 +127,8 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
+      # More memory is needed to build with asan
+      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
@ -316,32 +318,6 @@ jobs:
        ]}
    secrets: inherit

-  linux-jammy-py3-clang12-executorch-build:
-    if: false  # Docker build needs pin update
-    name: linux-jammy-py3-clang12-executorch
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py3-clang12-executorch
-      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
-      test-matrix: |
-        { include: [
-          { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-        ]}
-    secrets: inherit
-
-  linux-jammy-py3-clang12-executorch-test:
-    name: linux-jammy-py3-clang12-executorch
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-jammy-py3-clang12-executorch-build
-    if: false # Has been broken for a while
-    with:
-      build-environment: linux-jammy-py3-clang12-executorch
-      docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
-    secrets: inherit
-
  linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
    name: cuda12.8-py3.10-gcc9-sm75
    uses: ./.github/workflows/_linux-build.yml
--- a/.github/workflows/quantization-periodic.yml
+++ b/.github/workflows/quantization-periodic.yml
@ -1,54 +0,0 @@
-name: quantization-periodic
-
-on:
-  push:
-    tags:
-      - ciflow/quantization-periodic/*
-  workflow_dispatch:
-  schedule:
-    # run weekly
-    - cron: "45 0 * * 0"
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  get-default-label-prefix:
-    name: get-default-label-prefix
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-      opt_out_experiments: lf
-
-  periodic-quantization-build:
-    name: periodic-quantization-build
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-default-label-prefix
-    with:
-      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      cuda-arch-list: '8.9'
-      test-matrix: |
-        { include: [
-          { config: "quantization", shard: 1, num_shards: 1, runner: "${{ needs.get-default-label-prefix.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-        ]}
-    secrets: inherit
-  periodic-test-quantization:
-    name: periodic-test-quantization
-    uses: ./.github/workflows/_linux-test.yml
-    needs: periodic-quantization-build
-    with:
-      build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      docker-image: ${{ needs.periodic-quantization-build.outputs.docker-image }}
-      test-matrix: ${{ needs.periodic-quantization-build.outputs.test-matrix }}
-    secrets: inherit
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@ -140,6 +140,8 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
+      # More memory is needed to build with asan
+      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@ -259,3 +259,27 @@ jobs:
      docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
      test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
    secrets: inherit
+
+  linux-jammy-py3-clang12-executorch-build:
+    name: linux-jammy-py3-clang12-executorch
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-py3-clang12-executorch
+      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
+      test-matrix: |
+        { include: [
+          { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-py3-clang12-executorch-test:
+    name: linux-jammy-py3-clang12-executorch
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-jammy-py3-clang12-executorch-build
+    with:
+      build-environment: linux-jammy-py3-clang12-executorch
+      docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
+    secrets: inherit
--- a/.github/workflows/vllm.yml
+++ b/.github/workflows/vllm.yml
@ -36,6 +36,8 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
+      # When building vLLM, uv doesn't like that we rename wheel without changing the wheel metadata
+      allow-reuse-old-whl: false
      build-additional-packages: "vision audio"
      build-external-packages: "vllm"
      build-environment: linux-jammy-cuda12.8-py3.12-gcc11
--- a/.gitignore
+++ b/.gitignore
@ -259,6 +259,9 @@ gen
 .pytest_cache
 aten/build/*

+# Linker scripts for prioritized text optimization
+cmake/linker_script.ld
+
 # Bram
 plsdontbreak

--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -123,6 +123,7 @@ is_formatter = true
 code = 'MYPY'
 include_patterns = [
    'setup.py',
+    'functorch/dim/**/*.py',
    'torch/**/*.py',
    'torch/**/*.pyi',
    'caffe2/**/*.py',
@ -964,7 +965,6 @@ exclude_patterns = [
    'test/jit/**',  # should be run through test/test_jit.py
    'test/ao/sparsity/**',  # should be run through test/test_ao_sparsity.py
    'test/fx/**',  # should be run through test/test_fx.py
-    'test/bottleneck_test/**',  # excluded by test/run_test.py
    'test/package/**',  # excluded by test/run_test.py
    'test/distributed/argparse_util_test.py',
    'test/distributed/bin/test_script.py',
@ -1410,8 +1410,6 @@ exclude_patterns = [
    'torch/utils/benchmark/utils/timer.py',
    'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
    'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
-    'torch/utils/bottleneck/__init__.py',
-    'torch/utils/bottleneck/__main__.py',
    'torch/utils/bundled_inputs.py',
    'torch/utils/checkpoint.py',
    'torch/utils/collect_env.py',
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -380,6 +380,13 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
                       OFF "USE_CUDA" OFF)
 cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
                        "CPU_AARCH64" OFF)
+# Prioritized-text linker script: ON by default on AArch64 Linux; the option is available on AArch64, x86 and ppc64le.
+set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
+if(LINUX AND CPU_AARCH64)
+  set(USE_PRIORITIZED_TEXT_DEFAULT ON)
+endif()
+cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
+  "${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)

 option(USE_MIMALLOC "Use mimalloc" OFF)
 # Enable third party mimalloc library to improve memory allocation performance
@ -657,6 +664,11 @@ endif(MSVC)

 string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")

+# Set linker max-page-size to 64KiB on AArch64 Linux
+if(LINUX AND CPU_AARCH64)
+  add_link_options_if_supported("-z,max-page-size=0x10000")
+endif()
+
 # Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
 # applicable to mobile are disabled by this variable. Setting
 # `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
@ -891,7 +903,7 @@ IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
 endif()

 # Set USE_FBGEMM_GENAI to ON for CUDA build on SM100.
-if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
+if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32)
  message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a")
  set(USE_FBGEMM_GENAI ON)
 endif()
@ -1421,3 +1433,57 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
  install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
          DESTINATION "${CMAKE_INSTALL_BINDIR}")
 endif()
+
+if(USE_PRIORITIZED_TEXT_FOR_LD)
+  add_compile_options(
+    $<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
+    $<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
+  )
+  set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
+  set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
+
+  add_custom_command(
+    OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
+    COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
+    DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
+    COMMENT "Generating prioritized text linker files"
+    VERBATIM
+  )
+
+  add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
+
+  if(BUILD_PYTHON)
+    set(LINKER_OPT_TARGETS torch_python)
+  endif()
+
+  if(NOT BUILD_LIBTORCHLESS)
+    list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
+    if(USE_CUDA)
+      list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
+    endif()
+    if(USE_XPU)
+      list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
+    endif()
+    if(USE_ROCM)
+      list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
+    endif()
+  endif()
+
+  foreach(tgt IN LISTS LINKER_OPT_TARGETS)
+    if(TARGET ${tgt})
+      add_dependencies("${tgt}" generate_linker_script)
+      target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
+      set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
+    else()
+       message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
+    endif()
+  endforeach()
+
+else()
+  if(LINUX AND CPU_AARCH64)
+    message(WARNING [[
+    It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
+    To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
+    ]])
+  endif()
+endif()
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@ -180,7 +180,7 @@ void Context::setUserEnabledNNPACK(bool e) {
 }

 bool Context::allowTF32CuDNN(const std::string& op) const {
-  if (op.size() == 0){
+  if (op.empty()){
    bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
    bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
    TORCH_CHECK(
@ -281,9 +281,6 @@ bool Context::userEnabledOverrideableSDP() const {

 static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
 static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
-#ifdef USE_ROCM
-static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32";
-#endif

 bool Context::checkCuBLASConfigDeterministic() {
  // If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
@ -343,12 +340,6 @@ void Context::setImmediateMiopen(bool b) {
 }

 bool Context::allowTF32CuBLAS() const {
-#ifdef USE_ROCM
-    const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
-    if (allow_tf32 != true) {
-      return false;
-    }
-#endif
  bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
  bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
  TORCH_CHECK(
@ -362,14 +353,6 @@ bool Context::allowTF32CuBLAS() const {
 }

 void Context::setAllowTF32CuBLAS(bool b) {
-#ifdef USE_ROCM
-  const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
-  if (allow_tf32 != true) {
-    C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. "
-                              << "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it.";
-    return;
-  }
-#endif
  float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST;
  setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee");
 }
@ -443,7 +426,7 @@ void Context::setFloat32Precision(const std::string& backend, const std::string&
    std::string msg;
    auto iterp = _fp32_precisions.find(backend);
    TORCH_CHECK(iterp != _fp32_precisions.end());
-    for (auto p : iterp->second) {
+    for (const auto& p : iterp->second) {
      msg += p;
      msg += " ";
    }
--- a/aten/src/ATen/DLConvertor.cpp
+++ b/aten/src/ATen/DLConvertor.cpp
@ -65,14 +65,24 @@ DLDataType getDLDataType(const Tensor& t) {
      break;
    // TODO(#146647): use macro here instead of spelling out each shell dtype
    case ScalarType::Float8_e5m2:
+      dtype.code = DLDataTypeCode::kDLFloat8_e5m2;
+      break;
    case ScalarType::Float8_e5m2fnuz:
+      dtype.code = DLDataTypeCode::kDLFloat8_e5m2fnuz;
+      break;
    case ScalarType::Float8_e4m3fn:
+      dtype.code = DLDataTypeCode::kDLFloat8_e4m3fn;
+      break;
    case ScalarType::Float8_e4m3fnuz:
+      dtype.code = DLDataTypeCode::kDLFloat8_e4m3fnuz;
+      break;
    case ScalarType::Float8_e8m0fnu:
-      TORCH_CHECK_BUFFER(false, "float8 types are not supported by dlpack");
+      dtype.code = DLDataTypeCode::kDLFloat8_e8m0fnu;
      break;
    case ScalarType::Float4_e2m1fn_x2:
-      TORCH_CHECK_BUFFER(false, "float4 types are not supported by dlpack");
+      dtype.code = DLDataTypeCode::kDLFloat4_e2m1fn;
+      dtype.lanes = 2;
+      dtype.bits = 4;
      break;
    case ScalarType::QInt8:
    case ScalarType::QUInt8:
@ -177,7 +187,11 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat

 ScalarType toScalarType(const DLDataType& dtype) {
  ScalarType stype = ScalarType::Undefined;
-  TORCH_CHECK_BUFFER(dtype.lanes == 1, "ATen does not support lanes != 1");
+  if (dtype.code != DLDataTypeCode::kDLFloat4_e2m1fn) {
+    TORCH_CHECK_BUFFER(
+        dtype.lanes == 1,
+        "ATen does not support lanes != 1 for dtype code", std::to_string(dtype.code));
+  }
  switch (dtype.code) {
    case DLDataTypeCode::kDLUInt:
      switch (dtype.bits) {
@ -269,6 +283,73 @@ ScalarType toScalarType(const DLDataType& dtype) {
              false, "Unsupported kDLBool bits ", std::to_string(dtype.bits));
      }
      break;
+    case DLDataTypeCode::kDLFloat8_e5m2:
+      switch (dtype.bits) {
+        case 8:
+          stype = ScalarType::Float8_e5m2;
+          break;
+        default:
+          TORCH_CHECK_BUFFER(
+              false, "Unsupported kDLFloat8_e5m2 bits ", std::to_string(dtype.bits));
+      }
+      break;
+    case DLDataTypeCode::kDLFloat8_e5m2fnuz:
+      switch (dtype.bits) {
+        case 8:
+          stype = ScalarType::Float8_e5m2fnuz;
+          break;
+        default:
+          TORCH_CHECK_BUFFER(
+              false, "Unsupported kDLFloat8_e5m2fnuz bits ", std::to_string(dtype.bits));
+      }
+      break;
+    case DLDataTypeCode::kDLFloat8_e4m3fn:
+      switch (dtype.bits) {
+        case 8:
+          stype = ScalarType::Float8_e4m3fn;
+          break;
+        default:
+          TORCH_CHECK_BUFFER(
+              false, "Unsupported kDLFloat8_e4m3fn bits ", std::to_string(dtype.bits));
+      }
+      break;
+    case DLDataTypeCode::kDLFloat8_e4m3fnuz:
+      switch (dtype.bits) {
+        case 8:
+          stype = ScalarType::Float8_e4m3fnuz;
+          break;
+        default:
+          TORCH_CHECK_BUFFER(
+              false, "Unsupported kDLFloat8_e4m3fnuz bits ", std::to_string(dtype.bits));
+      }
+      break;
+    case DLDataTypeCode::kDLFloat8_e8m0fnu:
+      switch (dtype.bits) {
+        case 8:
+          stype = ScalarType::Float8_e8m0fnu;
+          break;
+        default:
+          TORCH_CHECK_BUFFER(
+              false, "Unsupported kDLFloat8_e8m0fnu bits ", std::to_string(dtype.bits));
+      }
+      break;
+    case DLDataTypeCode::kDLFloat4_e2m1fn:
+      switch (dtype.bits) {
+        case 4:
+          switch (dtype.lanes) {
+            case 2:
+              stype = ScalarType::Float4_e2m1fn_x2;
+              break;
+            default:
+              TORCH_CHECK_BUFFER(
+                false, "Unsupported kDLFloat4_e2m1fn lanes ", std::to_string(dtype.lanes));
+          }
+          break;
+        default:
+          TORCH_CHECK_BUFFER(
+              false, "Unsupported kDLFloat4_e2m1fn bits ", std::to_string(dtype.bits));
+      }
+      break;
    default:
      TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code));
  }
@ -354,8 +435,8 @@ T* toDLPackImpl(const Tensor& src) {
  atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device());
  atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim());
  atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src);
-  atDLMTensor->tensor.dl_tensor.shape = view.sizes().data();
-  atDLMTensor->tensor.dl_tensor.strides = view.strides().data();
+  atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(view.sizes().data());
+  atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(view.strides().data());
  atDLMTensor->tensor.dl_tensor.byte_offset = 0;
  fillVersion(&atDLMTensor->tensor);

--- a/aten/src/ATen/FunctionalStorageImpl.cpp
+++ b/aten/src/ATen/FunctionalStorageImpl.cpp
@ -102,7 +102,7 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
  // SparseTensorImpl has no storage, so we cannot query its nbytes.
  // (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse)
  // Same for XLA
-  if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) {
+  if (base.unsafeGetTensorImpl()->has_storage() && data_ptr().device().type() != c10::DeviceType::XLA) {
    original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes();
  } else {
    original_storage_size_ = -1;
--- a/aten/src/ATen/FunctionalTensorWrapper.cpp
+++ b/aten/src/ATen/FunctionalTensorWrapper.cpp
@ -133,7 +133,7 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const
  : c10::TensorImpl(
      c10::DispatchKeySet(DispatchKey::Functionalize),
      view_value.dtype(),
-      view_value.device()
+      base->storage().data_ptr().device()
    ),
    value_(view_value),
    is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
@ -485,7 +485,10 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptr<TensorI


 c10::Device FunctionalTensorWrapper::device_custom() const {
-  return value_.unsafeGetTensorImpl()->device();
+  // The storage pointer already uses the underlying tensor custom device (if
+  // applicable) to extract the device. So, we don't have to recurse again by
+  // doing value_.unsafeGetTensorImpl()->device().
+  return storage().data_ptr().device();
 }
 at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const {
  return value_.unsafeGetTensorImpl()->sizes();
--- a/aten/src/ATen/InferSize.h
+++ b/aten/src/ATen/InferSize.h
@ -94,10 +94,10 @@ inline at::DimVector infer_size_dv(IntArrayRef shape, int64_t numel) {

 inline at::SymDimVector infer_size_dv(
    c10::SymIntArrayRef shape,
-    c10::SymInt numel) {
+    const c10::SymInt& numel) {
  auto res = at::SymDimVector(shape);
  infer_size_impl<c10::SymIntArrayRef, c10::SymInt, at::SymDimVector>(
-      shape, std::move(numel), res);
+      shape, numel, res);
  return res;
 }

--- a/aten/src/ATen/core/Dict.h
+++ b/aten/src/ATen/core/Dict.h
@ -6,7 +6,6 @@
 #include <c10/util/TypeList.h>
 #include <c10/util/intrusive_ptr.h>
 #include <c10/util/order_preserving_flat_hash_map.h>
-#include <optional>
 #include <ATen/core/TensorBody.h>
 #include <ATen/core/jit_type_base.h>

--- a/aten/src/ATen/core/dispatch/CppSignature.h
+++ b/aten/src/ATen/core/dispatch/CppSignature.h
@ -55,8 +55,7 @@ class TORCH_API CppSignature final {
  }

 private:
-  explicit CppSignature(std::type_index signature)
-      : signature_(std::move(signature)) {}
+  explicit CppSignature(std::type_index signature) : signature_(signature) {}
  std::type_index signature_;
 };

--- a/aten/src/ATen/core/dispatch/Dispatcher.cpp
+++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp
@ -70,7 +70,7 @@ private:
 void _print_dispatch_trace(const std::string& label, const std::string& op_name, const DispatchKeySet& dispatchKeySet) {
  auto nesting_value = dispatch_trace_nesting_value();
  for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
-  std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
+  std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << '\n';
 }
 } // namespace detail

@ -213,9 +213,11 @@ OperatorHandle Dispatcher::findOrRegisterName_(const OperatorName& op_name) {
 // Windows build doesn't produce the destructor symbol in PyTorch libs
 // causing a linker failure in downstream projects.
 // x-ref https://github.com/pytorch/pytorch/issues/70032
+#if defined(_WIN32)
 OperatorHandle::~OperatorHandle() = default;
+#endif

-RegistrationHandleRAII Dispatcher::registerLibrary(std::string ns, std::string debug) {
+RegistrationHandleRAII Dispatcher::registerLibrary(const std::string& ns, std::string debug) {
  std::lock_guard<std::mutex> lock(guard_->mutex);
  auto found = libraries_.find(ns);
  TORCH_CHECK(
@ -306,7 +308,7 @@ PythonModuleMapType& pythonModulesSingleton() {

 }

-std::optional<std::pair<const char*, const char*>> Dispatcher::getPyStub(OperatorName op_name) {
+std::optional<std::pair<const char*, const char*>> Dispatcher::getPyStub(const OperatorName& op_name) {
  std::lock_guard<std::mutex> lock(guard_->mutex);
  auto found = pythonModulesSingleton().find(op_name);
  if (found == pythonModulesSingleton().end()) {
@ -342,7 +344,7 @@ RegistrationHandleRAII Dispatcher::registerPythonModule(
  });
 }

-void Dispatcher::throwIfHasPythonModule(OperatorName op_name) {
+void Dispatcher::throwIfHasPythonModule(const OperatorName& op_name) {
  std::lock_guard<std::mutex> lock(guard_->mutex);
  auto elt = pythonModulesSingleton().find(op_name);
  if (elt == pythonModulesSingleton().end()) {
@ -362,7 +364,7 @@ void Dispatcher::throwIfHasPythonModule(OperatorName op_name) {
 }

 RegistrationHandleRAII Dispatcher::registerImpl(
-  OperatorName op_name,
+  const OperatorName& op_name,
  std::optional<DispatchKey> dispatch_key,
  KernelFunction kernel,
  std::optional<impl::CppSignature> cpp_signature,
@ -377,7 +379,7 @@ RegistrationHandleRAII Dispatcher::registerImpl(
    *this,
    dispatch_key,
    std::move(kernel),
-    std::move(cpp_signature),
+    cpp_signature,
    std::move(inferred_function_schema),
    std::move(debug)
  );
@ -406,7 +408,7 @@ void Dispatcher::deregisterImpl_(const OperatorHandle& op, const OperatorName& o
  cleanup(op, op_name);
 }

-RegistrationHandleRAII Dispatcher::registerName(OperatorName op_name) {
+RegistrationHandleRAII Dispatcher::registerName(const OperatorName& op_name) {
  std::lock_guard<std::mutex> lock(guard_->mutex);
  auto op = findOrRegisterName_(op_name);
  ++op.operatorDef_->def_and_impl_count;
--- a/aten/src/ATen/core/dispatch/Dispatcher.h
+++ b/aten/src/ATen/core/dispatch/Dispatcher.h
@ -13,15 +13,10 @@
 #include <condition_variable>
 #include <list>
 #include <mutex>
-#include <type_traits>

 #include <ATen/core/enum_tag.h>
 #include <ATen/core/grad_mode.h>

-#ifndef NDEBUG
-#include <iostream>
-#endif
-
 namespace c10 {

 TORCH_API bool show_dispatch_trace();
@ -255,7 +250,7 @@ class TORCH_API Dispatcher final {
  // NB: steals the inferred function schema, as we may need to hold on to
  // it for a bit until the real schema turns up
  RegistrationHandleRAII registerImpl(
-      OperatorName op_name,
+      const OperatorName& op_name,
      std::optional<DispatchKey> dispatch_key,
      KernelFunction kernel,
      std::optional<impl::CppSignature> cpp_signature,
@ -274,15 +269,15 @@ class TORCH_API Dispatcher final {
  /**
   * Given an operator, throws if we have a pystub.
   */
-  void throwIfHasPythonModule(OperatorName op_name);
+  void throwIfHasPythonModule(const OperatorName& op_name);

  std::optional<std::pair<const char*, const char*>> getPyStub(
-      OperatorName op_name);
+      const OperatorName& op_name);

  /**
   * Register a new operator by name.
   */
-  RegistrationHandleRAII registerName(OperatorName op_name);
+  RegistrationHandleRAII registerName(const OperatorName& op_name);

  /**
   * Register a fallback kernel for a backend.
@ -300,7 +295,9 @@ class TORCH_API Dispatcher final {
   * API.  These invocations are only permitted once per program, so we raise
   * an error if this is called again for the same namespace.
   */
-  RegistrationHandleRAII registerLibrary(std::string ns, std::string debug);
+  RegistrationHandleRAII registerLibrary(
+      const std::string& ns,
+      std::string debug);

  // ------------------------------------------------------------------------
  //
@ -448,8 +445,12 @@ class TORCH_API OperatorHandle {
  OperatorHandle& operator=(OperatorHandle&&) noexcept = default;
  OperatorHandle(const OperatorHandle&) = default;
  OperatorHandle& operator=(const OperatorHandle&) = default;
+#if defined(_WIN32)
  // NOLINTNEXTLINE(performance-trivially-destructible)
  ~OperatorHandle();
+#else
+  ~OperatorHandle() = default;
+#endif

  const OperatorName& operator_name() const {
    return operatorDef_->op.operator_name();
--- a/aten/src/ATen/core/function_schema.h
+++ b/aten/src/ATen/core/function_schema.h
@ -556,7 +556,7 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
  // real_type versus fake_type: in order to be compatible with FunctionSchema
  // parser, printing an argument with either MemoryFormat or Layout type should
  // give us the original schema string, hence printing out real_type.
-  auto type = arg.real_type();
+  const auto& type = arg.real_type();
  bool is_opt = type->kind() == OptionalType::Kind;
  auto unopt_type = is_opt ? type->castRaw<OptionalType>()->getElementType() : type;

--- a/aten/src/ATen/core/jit_type.h
+++ b/aten/src/ATen/core/jit_type.h
@ -232,7 +232,7 @@ struct TORCH_API OptionalType : public UnionType {
  static TypePtr ofTensor();
  //
  // global singleton
-  static TypePtr get(TypePtr inner);
+  static TypePtr get(const TypePtr& inner);

 private:
  explicit OptionalType(const TypePtr& contained);
@ -895,7 +895,7 @@ struct TORCH_API ListType
  // the type List<T>.
  // The extra "identifier" argument is needed beccause we have multiple container types
  // that all re-use this function (List<T>, array<T, N>, etc.)
-  static TypePtr get(const std::string& identifier, TypePtr inner);
+  static TypePtr get(const std::string& identifier, const TypePtr& inner);

  // common cast List[Tensor]
  static ListTypePtr ofTensors();
--- a/aten/src/ATen/core/type.cpp
+++ b/aten/src/ATen/core/type.cpp
@ -274,7 +274,7 @@ ListTypePtr ListType::ofNumbers() {
  return value;
 }

-TypePtr OptionalType::get(TypePtr inner) {
+TypePtr OptionalType::get(const TypePtr& inner) {
  static ska::flat_hash_map<TypePtr, TypePtr> containerTypePtrs;
  static std::mutex mutex;
  // Perf from the lock is ok because this function is guarded behind
@ -287,7 +287,7 @@ TypePtr OptionalType::get(TypePtr inner) {
  return containerTypePtrs[inner];
 }

-TypePtr ListType::get(const std::string& identifier, TypePtr inner) {
+TypePtr ListType::get(const std::string& identifier, const TypePtr& inner) {
  static ska::flat_hash_map<std::tuple<std::string, TypePtr>, TypePtr> containerTypePtrs;
  static std::mutex mutex;
  // Perf from the lock is ok because this function is guarded behind
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@ -1954,8 +1954,8 @@ void scaled_gemm(
  #if ROCM_VERSION >= 70000
            if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) {
                // TODO: add constraints based on hipblaslt internals
-                TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0),
-                           "Matrix dimensions must be multiples of 32 for MX format. "
+                TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0),
+                           "M, N must be multiples of 16 and K should be multiple of 128 for MX format. "
                           "Got m=", m, ", n=", n, ", k=", k);
            }
  #endif
--- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
+++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
@ -266,11 +266,14 @@ CUDAGeneratorImpl::CUDAGeneratorImpl(
 * See Note [Acquire lock when using random generators]
 */
 void CUDAGeneratorImpl::set_current_seed(uint64_t seed) {
-  at::cuda::assertNotCapturing(
-      "Cannot call CUDAGeneratorImpl::set_current_seed");
-  state_->seed_ = seed;
-  state_->philox_offset_per_thread_ = 0;
-  no_reset_rnn_state_.clear();
+  if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
+    state_->seed_ = seed;
+    state_->philox_offset_per_thread_ = 0;
+    no_reset_rnn_state_.clear();
+  } else {
+    TORCH_CHECK(state_->seed_ == seed, "CUDAGeneratorImpl::set_current_seed can be called during stream capture only if new seed is the same as the original seed.");
+    // no-op case
+  }
 }

 /**
@ -299,9 +302,6 @@ uint64_t CUDAGeneratorImpl::get_offset() const {
 * Gets the current seed of CUDAGeneratorImpl.
 */
 uint64_t CUDAGeneratorImpl::current_seed() const {
-  // Debatable if current_seed() should be allowed in captured regions.
-  // Conservatively disallow it for now.
-  at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed");
  return state_->seed_;
 }

@ -346,8 +346,6 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
 * and size of the internal state.
 */
 void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
-  at::cuda::assertNotCapturing(
-      "Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing.");
  static const size_t seed_size = sizeof(uint64_t);
  static const size_t offset_size = sizeof(int64_t);
  static const size_t total_size = seed_size + offset_size;
@ -402,15 +400,27 @@ c10::intrusive_ptr<c10::GeneratorImpl> CUDAGeneratorImpl::graphsafe_get_state()
 */
 void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) {
  // see Note [Why enforce RNG offset % 4 == 0?]
+
+  // Note: If you use cuDNN RNNs, calling
+  // set_philox_offset_per_thread instead of set_offset will cause the
+  // cuDNN RNN RNG state to become stale.
  TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4");
-  state_->philox_offset_per_thread_ = offset;
+  if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
+    state_->philox_offset_per_thread_ = offset;
+  } else {
+    state_->offset_intragraph_ = offset;
+  }
 }

 /**
 * Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl.
 */
 uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const {
-  return state_->philox_offset_per_thread_;
+  if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
+    return state_->philox_offset_per_thread_;
+  } else {
+    return state_->offset_intragraph_;
+  }
 }

 /**
--- a/aten/src/ATen/cuda/detail/DeviceThreadHandles.h
+++ b/aten/src/ATen/cuda/detail/DeviceThreadHandles.h
@ -122,7 +122,7 @@ struct DeviceThreadHandlePool : public std::enable_shared_from_this<DeviceThread

    // Called by the destructor.  Releases this thread's handles back into the pool.
    void release() {
-        if(my_handles.size() > 0) {
+        if(!my_handles.empty()) {
            auto parent = weak_parent.lock();
            if (!parent) {
                // If this thread exits after atexit handlers have completed, the
--- a/aten/src/ATen/dlpack.h
+++ b/aten/src/ATen/dlpack.h
@ -19,7 +19,7 @@
 #define DLPACK_MAJOR_VERSION 1

 /*! \brief The current minor version of dlpack */
-#define DLPACK_MINOR_VERSION 0
+#define DLPACK_MINOR_VERSION 1

 /*! \brief DLPACK_DLL prefix for windows */
 #ifdef _WIN32
@ -32,9 +32,7 @@
 #define DLPACK_DLL
 #endif

-// NOLINTNEXTLINE(modernize-deprecated-headers)
 #include <stdint.h>
-// NOLINTNEXTLINE(modernize-deprecated-headers)
 #include <stddef.h>

 #ifdef __cplusplus
@ -159,6 +157,26 @@ typedef enum {
  kDLComplex = 5U,
  /*! \brief boolean */
  kDLBool = 6U,
+  /*! \brief FP8 data types */
+  kDLFloat8_e3m4 = 7U,
+  kDLFloat8_e4m3 = 8U,
+  kDLFloat8_e4m3b11fnuz = 9U,
+  kDLFloat8_e4m3fn = 10U,
+  kDLFloat8_e4m3fnuz = 11U,
+  kDLFloat8_e5m2 = 12U,
+  kDLFloat8_e5m2fnuz = 13U,
+  kDLFloat8_e8m0fnu = 14U,
+  /*! \brief FP6 data types
+   * Setting bits != 6 is currently unspecified, and the producer must ensure it is set
+   * while the consumer must stop importing if the value is unexpected.
+   */
+  kDLFloat6_e2m3fn = 15U,
+  kDLFloat6_e3m2fn = 16U,
+  /*! \brief FP4 data types
+   * Setting bits != 4 is currently unspecified, and the producer must ensure it is set
+   * while the consumer must stop importing if the value is unexpected.
+   */
+  kDLFloat4_e2m1fn = 17U,
 } DLDataTypeCode;

 /*!
@ -172,6 +190,12 @@ typedef enum {
 *   - int8: type_code = 0, bits = 8, lanes = 1
 *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
 *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
+ *   - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory)
+ *   - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory)
+ *   - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory)
+ *
+ *  When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e.,
+ *  for a packed data set D ((D >> (i * bits)) & bit_mask) stores the i-th element.
 */
 typedef struct {
  /*!
@ -229,12 +253,12 @@ typedef struct {
  /*! \brief The data type of the pointer*/
  DLDataType dtype;
  /*! \brief The shape of the tensor */
-  const int64_t* shape;
+  int64_t* shape;
  /*!
   * \brief strides of the tensor (in number of elements, not bytes)
   *  can be NULL, indicating tensor is compact and row-majored.
   */
-  const int64_t* strides;
+  int64_t* strides;
  /*! \brief The offset in bytes to the beginning pointer to data */
  uint64_t byte_offset;
 } DLTensor;
@ -269,7 +293,7 @@ typedef struct DLManagedTensor {
  void (*deleter)(struct DLManagedTensor * self);
 } DLManagedTensor;

-// bit masks used in in the DLManagedTensorVersioned
+// bit masks used in the DLManagedTensorVersioned

 /*! \brief bit mask to indicate that the tensor is read only. */
 #define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
@ -282,6 +306,14 @@ typedef struct DLManagedTensor {
 */
 #define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)

+/*!
+ * \brief bit mask to indicate whether a sub-byte type is packed or padded.
+ *
+ * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can
+ * be set by the producer to signal that a tensor of sub-byte type is padded.
+ */
+#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL)
+
 /*!
 * \brief A versioned and managed C Tensor object, manage memory of DLTensor.
 *
--- a/aten/src/ATen/functorch/ADInterpreters.cpp
+++ b/aten/src/ATen/functorch/ADInterpreters.cpp
@ -139,7 +139,7 @@ static void autogradBasedTransformSendToNext(
  std::bitset<default_bitset_size> outputs_aliasing_immutable; // set = 1 for all bits
  if(!grad_special_case) {
    for (auto idx = stack->size() - args_size; idx < stack->size(); idx++) {
-      const auto ivalue = (*stack)[idx];
+      const auto& ivalue = (*stack)[idx];
      if (!ivalue.isTensor()) {
        continue; // only input that can be aliased is a tensor, not a tensor list (except in ops without returns)
      }
--- a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp
+++ b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp
@ -6,6 +6,8 @@

 #include <ATen/functorch/BatchRulesHelper.h>

+#include <algorithm>
+
 namespace at::functorch {

 typedef std::tuple<Tensor, std::optional<int64_t>> oneOutput;
@ -315,7 +317,7 @@ oneOutput linalg_lu_solve_batch_rule(
  const auto LU_num_batch_dims = rankWithoutBatchDim(LU_, LU_bdim) - LU_min_rank;
  const auto pivots_num_batch_dims = rankWithoutBatchDim(pivots_, pivots_bdim) - pivots_min_rank;
  const auto B_num_batch_dims = rankWithoutBatchDim(B_, B_bdim) - B_min_rank;
-  const auto max_num_batch_dims = std::max(std::max(LU_num_batch_dims, pivots_num_batch_dims), B_num_batch_dims);
+  const auto max_num_batch_dims = std::max({LU_num_batch_dims, pivots_num_batch_dims, B_num_batch_dims});

  LU_ = maybePadToLogicalRank(LU_, LU_bdim, max_num_batch_dims + LU_min_rank);
  pivots_ = maybePadToLogicalRank(pivots_, pivots_bdim, max_num_batch_dims + pivots_min_rank);
--- a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp
@ -171,6 +171,8 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {

  POINTWISE_BOXED(fill_.Scalar);
  POINTWISE_BOXED(zero_);
+  // This is special because this op doesn't return anything
+  m.impl("_assert_tensor_metadata", native::_assert_tensor_metadata);

 #undef UNARY_POINTWISE
 #undef UNARY_POINTWISE_ALL
--- a/aten/src/ATen/native/BinaryOps.cpp
+++ b/aten/src/ATen/native/BinaryOps.cpp
@ -897,11 +897,11 @@ Tensor& div_(Tensor& self, const Scalar& other) {
 }

 Tensor div(const Tensor& self, const Scalar& other, std::optional<std::string_view> rounding_mode) {
-  return self.div(wrapped_scalar_tensor(other), std::move(rounding_mode)); // redispatch!
+  return self.div(wrapped_scalar_tensor(other), rounding_mode); // redispatch!
 }

 Tensor& div_(Tensor& self, const Scalar& other, std::optional<std::string_view> rounding_mode) {
-  return self.div_(wrapped_scalar_tensor(other), std::move(rounding_mode)); // redispatch!
+  return self.div_(wrapped_scalar_tensor(other), rounding_mode); // redispatch!
 }

 // divide, alias for div
@ -926,23 +926,23 @@ Tensor& divide_(Tensor& self, const Scalar& other) {
 }

 Tensor& divide_out(const Tensor& self, const Tensor& other, std::optional<std::string_view> rounding_mode, Tensor& result) {
-  return at::div_out(result, self, other, std::move(rounding_mode));
+  return at::div_out(result, self, other, rounding_mode);
 }

 Tensor divide(const Tensor& self, const Tensor& other, std::optional<std::string_view> rounding_mode) {
-  return self.div(other, std::move(rounding_mode));
+  return self.div(other, rounding_mode);
 }

 Tensor& divide_(Tensor& self, const Tensor& other, std::optional<std::string_view> rounding_mode) {
-  return self.div_(other, std::move(rounding_mode));
+  return self.div_(other, rounding_mode);
 }

 Tensor divide(const Tensor& self, const Scalar& other, std::optional<std::string_view> rounding_mode) {
-  return self.div(other, std::move(rounding_mode));
+  return self.div(other, rounding_mode);
 }

 Tensor& divide_(Tensor& self, const Scalar& other, std::optional<std::string_view> rounding_mode) {
-  return self.div_(other, std::move(rounding_mode));
+  return self.div_(other, rounding_mode);
 }

 // true_divide, an alias for div
--- a/aten/src/ATen/native/ChanelShuffle.cpp
+++ b/aten/src/ATen/native/ChanelShuffle.cpp
@ -81,7 +81,7 @@ Tensor math_channel_shuffle(const Tensor& self, int64_t groups) {
  // TODO: contiguous can be made to preserve the memory format
  // of the input. However since the above reshape clobbers h and w
  // it may not be safe to do that, since channels_last contiguous
-  // may think oc and and the last dim correspond to h,w?
+  // may think oc and the last dim correspond to h,w?
  // It is not clear, however from initial looking around it feels that
  // this may not be correct.
  // In this case channels last will likely require custom implementation
--- a/aten/src/ATen/native/EmbeddingBag.h
+++ b/aten/src/ATen/native/EmbeddingBag.h
@ -1,3 +1,4 @@
+#pragma once
 #include <ATen/core/Tensor.h>
 #include <ATen/Config.h>
 #include <cstdint>
--- a/aten/src/ATen/native/FractionalMaxPool3d.cpp
+++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp
@ -67,13 +67,13 @@ TORCH_PRECOMPUTE_META_FUNC(fractional_max_pool3d)(
  int64_t inputH = input_.size(heightDim);
  int64_t inputW = input_.size(widthDim);

-  TORCH_CHECK(outputT + poolSizeT - 1 < inputT,
+  TORCH_CHECK((poolSizeT <= inputT) && (outputT + poolSizeT - 1 < inputT),
           "fractional_max_pool3d_out(): pool time ", poolSizeT,
           " too large relative to input time ", inputT);
-  TORCH_CHECK(outputW + poolSizeW - 1 < inputW,
+  TORCH_CHECK((poolSizeW <= inputW) && (outputW + poolSizeW - 1 < inputW),
           "fractional_max_pool3d_out(): pool width ", poolSizeW,
           " too large relative to input width ", inputW);
-  TORCH_CHECK(outputH + poolSizeH - 1 < inputH,
+  TORCH_CHECK((poolSizeH <= inputH) && (outputH + poolSizeH - 1 < inputH),
           "fractional_max_pool3d_out(): pool height ", poolSizeH,
           " too large relative to input height ", inputH);

--- a/aten/src/ATen/native/Histogram.cpp
+++ b/aten/src/ATen/native/Histogram.cpp
@ -150,7 +150,7 @@ void histogramdd_prepare_out(const Tensor& input, const std::vector<int64_t>& bi
 void histogramdd_prepare_out(const Tensor& input, TensorList bins,
        const Tensor& hist, const TensorList& bin_edges) {
    std::vector<int64_t> bin_ct(bins.size());
-    std::transform(bins.begin(), bins.end(), bin_ct.begin(), [](Tensor t) { return t.numel() - 1; });
+    std::transform(bins.begin(), bins.end(), bin_ct.begin(), [](const Tensor& t) { return t.numel() - 1; });
    histogramdd_prepare_out(input, bin_ct, hist, bin_edges);
 }

--- a/aten/src/ATen/native/Linear.cpp
+++ b/aten/src/ATen/native/Linear.cpp
@ -360,7 +360,7 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr
  // to compute the number of dimensions covered by ellipsis.
  for(const auto i : c10::irange(num_ops)) {
    const auto& operand = operands[i];
-    const auto labels = op_labels[i];
+    const auto& labels = op_labels[i];
    const auto ndims = operand.dim();
    int64_t nlabels = static_cast<int64_t>(labels.size());
    bool has_ellipsis = false;
--- a/aten/src/ATen/native/LinearAlgebra.cpp
+++ b/aten/src/ATen/native/LinearAlgebra.cpp
@ -237,7 +237,7 @@ TORCH_META_FUNC(linalg_vector_norm)(const Tensor& self, const Scalar& scalar_ord
  at::detail::check_linalg_norm_dtype(opt_dtype, self.scalar_type(), "linalg.vector_norm");

  auto mask = at::native::make_dim_mask(dim, self.dim());
-  auto shape = at::native::shape_from_dim_mask(self, std::move(mask), keepdim);
+  auto shape = at::native::shape_from_dim_mask(self, mask, keepdim);
  auto options = self.options()
                     .dtype(toRealValueType(opt_dtype.value_or(self.scalar_type())));

@ -641,7 +641,7 @@ namespace {
 Tensor linalg_matrix_power_impl(
    const Tensor& self,
    int64_t n,
-    std::optional<Tensor> _out) {
+    const std::optional<Tensor>& _out) {
  NoTF32Guard disable_tf32;
  auto out = _out.value_or(Tensor());

@ -1019,7 +1019,7 @@ Tensor multi_dot_impl(TensorList _tensors, std::optional<Tensor> _out) {
  Tensor result;

  if (_out.has_value()) {
-    auto out = *_out;
+    const auto& out = *_out;
    TORCH_CHECK(
        dtype == out.dtype(),
        "multi_dot(): expected out tensor to have dtype ",
--- a/aten/src/ATen/native/LossCTC.cpp
+++ b/aten/src/ATen/native/LossCTC.cpp
@ -493,7 +493,7 @@ Tensor get_clamped_target_length(
 // the gradient is implemented for _cudnn_ctc_loss (just in derivatives.yaml) and _ctc_loss and this function has automatic gradients
 // it also handles the reduction if desired
 template <typename LengthsType>
-Tensor ctc_loss_impl(const Tensor& log_probs_, const Tensor& targets, LengthsType input_lengths, LengthsType target_lengths, int64_t BLANK, int64_t reduction, bool zero_infinity) {
+Tensor ctc_loss_impl(const Tensor& log_probs_, const Tensor& targets, const LengthsType& input_lengths, const LengthsType& target_lengths, int64_t BLANK, int64_t reduction, bool zero_infinity) {
  auto is_batched = log_probs_.dim() == 3;
  Tensor log_probs = is_batched ? log_probs_ : log_probs_.unsqueeze(1);
  bool use_cudnn =
--- a/aten/src/ATen/native/Normalization.cpp
+++ b/aten/src/ATen/native/Normalization.cpp
@ -599,7 +599,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
    check_dims_match_num_input_features("weight", num_features, weight.sym_numel());
  }
  if (bias.defined()) {
-    check_dims_match_num_input_features("bias", std::move(num_features), bias.sym_numel());
+    check_dims_match_num_input_features("bias", num_features, bias.sym_numel());
  }

  BatchNormBackend backend = _select_batch_norm_backend(input, weight, bias, running_mean, running_var, training, eps);
@ -923,7 +923,7 @@ std::tuple<Tensor, Tensor, Tensor> _batch_norm_legit_no_stats_cpu(
 std::tuple<Tensor, Tensor, Tensor> _batch_norm_legit_no_training(
    const Tensor& self, const std::optional<Tensor>& weight_opt, const std::optional<Tensor>& bias_opt,
    const Tensor& running_mean, const Tensor& running_var, double momentum, double eps) {
-  return at::_native_batch_norm_legit(self, weight_opt, bias_opt, const_cast<Tensor&>(running_mean), const_cast<Tensor&>(running_var), /*train=*/false, momentum, eps);
+  return at::_native_batch_norm_legit(self, weight_opt, bias_opt, const_cast<Tensor&>(running_mean), const_cast<Tensor&>(running_var), /*training=*/false, momentum, eps);
 }


--- a/aten/src/ATen/native/PadNd.cpp
+++ b/aten/src/ATen/native/PadNd.cpp
@ -73,7 +73,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value)
    for (const auto i : c10::irange((size_t)l_pad)) {
        auto pad_idx = pad.size() - ((i + 1) * 2);
        auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
-        TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
+        TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
                 pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, "
                 "which is invalid. Check dimension ", l_diff + i, " of your input.");
        new_shape.emplace_back(new_dim);
--- a/aten/src/ATen/native/RNN.cpp
+++ b/aten/src/ATen/native/RNN.cpp
@ -1533,7 +1533,7 @@ std::tuple<Tensor, Tensor> lstm_cell(
  check_rnn_cell_forward_input(input, w_ih.sym_size(1));
  auto hidden_size = w_hh.sym_size(1);
  check_rnn_cell_forward_hidden(input, hx[0], hidden_size, 0);
-  check_rnn_cell_forward_hidden(input, hx[1], std::move(hidden_size), 1);
+  check_rnn_cell_forward_hidden(input, hx[1], hidden_size, 1);
  static at::Tensor undefined;
  return LSTMCell<CellParams>{}(input, std::make_tuple(hx[0], hx[1]), CellParams{w_ih, w_hh, b_ih, b_hh, undefined});
 }
@ -1612,13 +1612,13 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_differentiable_gru_cell
    h_g = h_g + hidden_bias;
  }
  auto chunked_input_gates = in_g.unsafe_chunk(3, 1);
-  Tensor ir = chunked_input_gates[0];
-  Tensor ii = chunked_input_gates[1];
-  Tensor in = chunked_input_gates[2];
+  const Tensor& ir = chunked_input_gates[0];
+  const Tensor& ii = chunked_input_gates[1];
+  const Tensor& in = chunked_input_gates[2];
  auto chunked_hidden_gates = h_g.unsafe_chunk(3, 1);
-  Tensor hr = chunked_hidden_gates[0];
-  Tensor hi = chunked_hidden_gates[1];
-  Tensor hn = chunked_hidden_gates[2];
+  const Tensor& hr = chunked_hidden_gates[0];
+  const Tensor& hi = chunked_hidden_gates[1];
+  const Tensor& hn = chunked_hidden_gates[2];
  Tensor rg = (ir + hr).sigmoid();
  Tensor ig = (ii + hi).sigmoid();
  Tensor grad_hx = grad_hy * ig;
--- a/aten/src/ATen/native/TriangularOps.cpp
+++ b/aten/src/ATen/native/TriangularOps.cpp
@ -52,6 +52,7 @@ void apply_triu_tril_single(
    int64_t self_col_stride,
    bool upper) {
  constexpr int64_t zero = 0;
+  k = std::clamp(k, -n, m); // Clamp k to [-n, m] to prevent i + k arithmetic overflow, especially if k approaches INT64_MAX/INT64_MIN.

  if (upper) {
    parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
--- a/aten/src/ATen/native/UnaryOps.cpp
+++ b/aten/src/ATen/native/UnaryOps.cpp
@ -409,17 +409,17 @@ static inline Tensor& unary_op_impl_out(Tensor& result, const Tensor& self, Stub
 }

 template <typename Stub, typename ...Args>
-static inline Tensor& unary_op_impl_float_out(Tensor& result, const Tensor& self, Stub& stub, Args... args) {
+static inline Tensor& unary_op_impl_float_out(Tensor& result, const Tensor& self, Stub& stub, Args&&... args) {
  auto iter = TensorIterator::unary_float_op(result, self);
-  stub(iter.device_type(), iter, args...);
+  stub(iter.device_type(), iter, std::forward<Args>(args)...);
  return result;
 }

 template <typename Stub, typename ...Args>
-static inline Tensor unary_op_impl_float(const Tensor& self, Stub& stub, Args... args) {
+static inline Tensor unary_op_impl_float(const Tensor& self, Stub& stub, Args&&... args) {
  Tensor result;
  auto iter = TensorIterator::unary_float_op(result, self);
-  stub(iter.device_type(), iter, args...);
+  stub(iter.device_type(), iter, std::forward<Args>(args)...);
  return iter.output();
 }

--- a/aten/src/ATen/native/Unique.cpp
+++ b/aten/src/ATen/native/Unique.cpp
@ -323,7 +323,7 @@ std::tuple<Tensor, Tensor, Tensor> unique_consecutive_cpu_template(

 template<class ForwardIt>
 ForwardIt _unique_dim_cpu_impl(ForwardIt first, ForwardIt last,
-  std::vector<int64_t>& indices, Tensor inverse_indices_vec, Tensor counts) {
+  std::vector<int64_t>& indices, const Tensor& inverse_indices_vec, const Tensor& counts) {
    if (first == last) {
      return last;
    }
--- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_deserialize.cpp
+++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_deserialize.cpp
@ -24,7 +24,7 @@ constexpr int64_t num_output_channels_index [[maybe_unused]] = 10;
 constexpr int64_t num_input_channels_index [[maybe_unused]] = 11;

 template <typename TENSOR_DTYPE, typename VEC_DTYPE>
-std::vector<VEC_DTYPE> unwrap_vector(at::Tensor tensor) {
+std::vector<VEC_DTYPE> unwrap_vector(const at::Tensor& tensor) {
  std::vector<VEC_DTYPE> vec(tensor.numel());
  TENSOR_DTYPE* tensor_data_ptr = tensor.data_ptr<TENSOR_DTYPE>();
  std::copy(tensor_data_ptr, tensor_data_ptr + tensor.numel(), vec.data());
@ -39,7 +39,7 @@ std::vector<VEC_DTYPE> unwrap_vector(at::Tensor tensor) {
 */
 void unpack_bcsr(
    int8_t* dst,
-    ao::sparse::BCSR bcsr,
+    const ao::sparse::BCSR& bcsr,
    const int64_t R,
    const int64_t C,
    const int64_t RB,
--- a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp
+++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp
@ -85,11 +85,11 @@ void cpu_max_unpool(
    if constexpr (is_3d) {
      TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
          " (output volumes are of size ", output_depth,
-          "x", output_height, "x", output_width);
+          "x", output_height, "x", output_width, ")");
    } else {
      TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
          " (output volumes are of size ", output_height,
-          "x", output_width);
+          "x", output_width, ")");
    }
  }

--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -1138,9 +1138,14 @@ bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) {
 bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) {
  // TODO: We might want to enforce some structure on the shapes of the scale
  // tensors
-  return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
-      && scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4)
-      && scale.is_contiguous());
+  bool is_fp8_path = (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
+      && scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4));
+  bool is_packed_fp4_path = false;
+#ifdef USE_ROCM
+  is_packed_fp4_path = (t.scalar_type() == ScalarType::Float4_e2m1fn_x2 && scale.scalar_type() == at::kFloat8_e8m0fnu
+      && scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1) * 2, 32), 4));
+#endif
+  return (is_fp8_path || is_packed_fp4_path) && scale.is_contiguous();
 }

 bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) {
@ -1381,9 +1386,15 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
    TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
                "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");

-    TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
-                mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
-                "Matrix dimensions must be multiples of 32 for block-wise scaling");
+    int packed_factor = 1;
+    if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
+      // For float4 data type, each byte stores two 4-bit floating-point values,
+      // effectively packing two elements into one byte.
+      packed_factor = 2;
+    }
+    TORCH_CHECK(mat1.size(0) % 16 == 0 && (mat1.size(1) * packed_factor) % 128 == 0 &&
+                mat2.size(1) % 16 == 0,
+                "M, N must be multiples of 16 and K must be multiple of 128 for block-wise scaling");

    TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
                out.scalar_type() == ScalarType::Half,
--- a/aten/src/ATen/native/cuda/Reduce.cuh
+++ b/aten/src/ATen/native/cuda/Reduce.cuh
@ -416,6 +416,7 @@ struct ReduceOp {
    if (config.should_block_y_reduce()) {
      value = block_y_reduce<output_vec_size>(value, shared_memory);
    }
+    __syncthreads();
    if (config.should_block_x_reduce()) {
      value = block_x_reduce<output_vec_size>(value, shared_memory);
    }
--- a/aten/src/ATen/native/cuda/Repeat.cu
+++ b/aten/src/ATen/native/cuda/Repeat.cu
@ -17,12 +17,11 @@ __global__ static void compute_cuda_kernel(
    index_t* result_ptr,
    int64_t size,
    int64_t result_size) {
-  if (C10_UNLIKELY((result_size != cumsum_ptr[size - 1]))) {
-    printf("%s:%d:%s: block: [%d,%d,%d], thread: [%d,%d,%d] "
+  CUDA_KERNEL_ASSERT_PRINTF(
+      result_size == cumsum_ptr[size - 1],
      "Invalid input! In `repeat_interleave`, the `output_size` argument (%ld) must be the same as the sum of the elements in the `repeats` tensor (%ld).\n",
-      __FILE__, __LINE__, __func__,blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, result_size, cumsum_ptr[size - 1 ]);
-    CUDA_KERNEL_ASSERT(result_size == cumsum_ptr[size - 1])
-  }
+      result_size,
+      cumsum_ptr[size - 1]);

  int64_t idx = ((int64_t) blockIdx.x) * blockDim.x + threadIdx.x;
  int64_t stride = (blockDim.x * gridDim.x) / C10_WARP_SIZE;
--- a/aten/src/ATen/native/cuda/Shape.cu
+++ b/aten/src/ATen/native/cuda/Shape.cu
@ -226,6 +226,38 @@ __global__ void CatArrayBatchedCopy_contig(
    }
 }

+
+template <typename T, typename IndexType, int Dims, int batch_size, int stride_size, int alignment, int elems_per_vec>
+__global__ void CatArrayBatchedCopy_vectorized(
+    char* output,
+    CatArrInputTensorMetadata<T, IndexType, batch_size, stride_size> inputs,
+    TensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
+    const int concatDim,
+    IndexType trailingSize) {
+
+    IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
+    IndexType nElements = inputs.nElements[blockIdx.y] / elems_per_vec;
+
+    if(tid >= nElements) return;
+
+    const char * data = (char*)inputs.input[blockIdx.y];
+    IndexType offset = inputs.offset[blockIdx.y] * trailingSize / elems_per_vec;
+    IndexType dimSize = inputs.dimSize[blockIdx.y] * trailingSize / elems_per_vec;
+    int64_t dataOffset = (int64_t)offset  * alignment; // in bytes
+
+    IndexType stride = gridDim.x * blockDim.x;
+
+    while( tid < nElements){
+      int64_t elementOffset = (int64_t)CatArrIndexToOffset<IndexType, Dims>::compute(
+                    os.tensorSize, os.tensorStride, dimSize, concatDim, tid) * alignment; // in bytes
+      auto vec = at::native::memory::ld_vec<alignment>(data + (int64_t)alignment * tid);
+      at::native::memory::st_vec<alignment>(output + dataOffset + elementOffset, vec);
+      tid += stride;
+    }
+}
+
+
+
 /*
  Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads
  to improve memory bandwidth throughput.
@ -296,12 +328,27 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
  scalar_t *data = (scalar_t *)(out.mutable_data_ptr());
  CatArrInputTensorMetadata<scalar_t, unsigned int, batch_size, stride_size> catMetaData;
  TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> outputParam;
+  // If all batches are contiguous we can call a specialized implementation
+  // which requires the input tensor addresses to be aligned to a
+  // 16 Byte boundary.
+
+  constexpr bool isContig = stride_size == 1;
+  bool isAligned = true;
+  constexpr int alignment = 16;

  // Next, let's initialize the size, stride arrays for the output Tensor.
+  // for contig case, we'll canonicalize output strides, so that
+  // we don't have arbitrary strides for dims of size 0
+  size_t stride0 = 1;
  if (memory_format == c10::MemoryFormat::Contiguous) {
-    for (int i = 0; i < nDims; ++i) {
+    for (int i = nDims - 1; i >= 0; --i) {
      outputParam.tensorSize[i] = out.size(i);
-      outputParam.tensorStride[i] = out.stride(i);
+      if (isContig) {
+        outputParam.tensorStride[i] = stride0;
+        stride0 *= out.size(i);
+      } else {
+        outputParam.tensorStride[i] = out.stride(i);
+      }
    }
  } else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) {
    // permute the semantics of dims from NCHW to NHWC so that the input
@ -320,12 +367,15 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i

  at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();

-  // If all batches are contiguous we can call a specialized implementation
-  // which requires the input tensor addresses to be aligned to a
-  // 16 Byte boundary.

-  bool isContig = true;
-  bool isAligned = true;
+  // for channels last computing slice size correctly is much more involved, so we never send it
+  // on the fully vectorized path
+  // we need output stride in cat dimension to be multiple of alignment,
+  // if we ever use it to compute offsets
+  // for catting in 0th dimension it doesn't matter
+  bool isInOutAligned = isContig && at::native::memory::get_alignment(data) >= alignment &&
+                        memory_format == c10::MemoryFormat::Contiguous && (dimension == 0 ||
+                        outputParam.tensorStride[dimension - 1] * sizeof(scalar_t) % alignment == 0);
  unsigned int max_elements_per_tensor = 0;

  // Now we loop
@ -341,6 +391,16 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
      // high-dimensional tensor
      if (inputs[i+batchCounter].get().numel() > 0) {
        dimSize = inputs[i+batchCounter].get().size(dimension);
+        if (isInOutAligned) {
+          auto t = inputs[i+batchCounter].get();
+          // similarly to output stride, we cannot trust stride value to
+          // determine slice size if the corresponding dimension is 1
+          // we have to multiply all the subsequent sizes
+          int64_t slice_size = dimension == 0 ? t.numel() : t.sizes()[dimension - 1] != 1 ?
+             t.strides()[dimension - 1] : c10::multiply_integers(t.sizes().begin() + dimension, t.sizes().end());
+          slice_size *= sizeof(scalar_t);
+          isInOutAligned &= (slice_size % alignment == 0);
+        }
      }

      catMetaData.input[batchCounter] = (scalar_t*)(inputs[i+batchCounter].get().const_data_ptr());
@ -351,10 +411,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
 #ifdef USE_ROCM
      // On ROCm, CatArrayBatchedCopy_contig is faster
      isAligned = false;
+      isInOutAligned = false;
 #else
      // If at least one of the inputs is not aligned, we can't call the
      // CatArrayBatchedCopy_alignedK_contig
      isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]);
+      isInOutAligned &= at::native::memory::get_alignment(catMetaData.input[batchCounter]) >= alignment;
 #endif

      if (stride_size > 1) {
@ -365,7 +427,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
          catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j];
        }
        catMetaData.isContiguous[batchCounter] = false;
-        isContig = false;
      } else {
        catMetaData.isContiguous[batchCounter] = true;
      }
@ -388,10 +449,13 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
          max_elements_per_tensor, batchCounter);
 #else
    dim3 applyBlock, catGrid;
-    if (isContig && sizeof(scalar_t) > 2) {
+    if (isInOutAligned) {
+      std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, alignment>(
+        max_elements_per_tensor, batchCounter);
+    } else if (isContig && isAligned && sizeof(scalar_t) > 2) {
      std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_16>(
          max_elements_per_tensor, batchCounter);
-    } else if (isContig && sizeof(scalar_t) == 2) {
+    } else if (isContig && isAligned && sizeof(scalar_t) == 2) {
      std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_8>(
          max_elements_per_tensor, batchCounter);
    } else {
@ -399,6 +463,30 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
      getCatGrid(batchCounter, catGrid);
    }
 #endif
+    int32_t trailingSize;
+    TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam;
+    if (isInOutAligned) {
+      // in this case we can and should flatten the tensors after the cat dim
+      // we want to view the tensors as if consisting of `alignment`-sized elements
+      // however, we might not be able to cleanly divide just the last dim -
+      // it might not be a multiple of alignment.
+      // however, we know that the full concatenated slice is a multiple of alignment,
+      // so if we flatten all the dims after and including concat dim,
+      // it will be divisible by alignment
+      // then we need to divide last out size by elems_per_vec,
+      // and divide all strides except last by elems_per_vec (last stride is 1 always)
+      // for input, we will fix up the sizes and strides in the kernel directly
+      kernelOutputParam = outputParam;
+      nDims = dimension + 1;
+      constexpr auto elems_per_vec = alignment / sizeof(scalar_t);
+      auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1];
+      kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec;
+      trailingSize = outputParam.tensorStride[dimension];
+      kernelOutputParam.tensorStride[dimension] = 1;
+      for (int i = 0; i < dimension; ++i) {
+        kernelOutputParam.tensorStride[i] /= elems_per_vec;
+      }
+    }

    if (memory_format != c10::MemoryFormat::Contiguous) {
      switch (dimension) {
@ -413,7 +501,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
    }
    // Template Declarations for dim = 1, 2, 3, 4
 #define HANDLE_CASE(DIMS) \
-    if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
+    if (isInOutAligned) {\
+      constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \
+      CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\
+      catGrid, applyBlock, 0, stream.stream()>>>(\
+        (char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\
+    } else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
      CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\
          catGrid, applyBlock, 0, stream.stream()>>>(\
              data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\
--- a/aten/src/ATen/native/cuda/int8mm.cu
+++ b/aten/src/ATen/native/cuda/int8mm.cu
@ -5,12 +5,20 @@

 namespace at::native {

-__global__ void weight_int8pack_mm_kernel(const float* x, const int8_t* w, const float* scale, float* out, int B, int K, int N) {
+__global__ void weight_int8pack_mm_kernel(
+    const float* x,
+    const int8_t* w,
+    const float* scale,
+    float* out,
+    int B,
+    int K,
+    int N) {
  // one thread per output element: [B, N]
  int b = blockIdx.y * blockDim.y + threadIdx.y;
  int n = blockIdx.x * blockDim.x + threadIdx.x;

-  if (b >= B || n >= N) return;
+  if (b >= B || n >= N)
+    return;

  float acc = 0.0f;
  for (int k = 0; k < K; ++k) {
@ -20,7 +28,11 @@ __global__ void weight_int8pack_mm_kernel(const float* x, const int8_t* w, const
  out[b * N + n] = acc * scale[n];
 }

-void launch_weight_int8pack_mm_cuda_kernel(const Tensor& x, const Tensor& w_int8, const Tensor& scale, Tensor& out) {
+void launch_weight_int8pack_mm_cuda_kernel(
+    const Tensor& x,
+    const Tensor& w_int8,
+    const Tensor& scale,
+    Tensor& out) {
  const int B = x.size(0);
  const int K = x.size(1);
  const int N = w_int8.size(0);
@ -35,12 +47,16 @@ void launch_weight_int8pack_mm_cuda_kernel(const Tensor& x, const Tensor& w_int8
      w_int8.data_ptr<int8_t>(),
      scale.data_ptr<float>(),
      out.data_ptr<float>(),
-      B, K, N);
+      B,
+      K,
+      N);
 }

-
 // Main GPU entry point
-at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int8, const at::Tensor& scale) {
+at::Tensor _weight_int8pack_mm_cuda(
+    const at::Tensor& x,
+    const at::Tensor& w_int8,
+    const at::Tensor& scale) {
  // --- Check inputs ---
  TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor");
  TORCH_CHECK(w_int8.is_cuda(), "w must be a CUDA tensor");
@ -50,12 +66,16 @@ at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int
  TORCH_CHECK(w_int8.dim() == 2, "w must be 2D");
  TORCH_CHECK(scale.dim() == 1, "scale must be 1D");

-  TORCH_CHECK(x.size(1) == w_int8.size(1), "K dimension mismatch: x.size(1) != w.size(1)");
-  TORCH_CHECK(w_int8.size(0) == scale.size(0), "Output dim mismatch: w.size(0) != scale.size(0)");
+  TORCH_CHECK(
+      x.size(1) == w_int8.size(1),
+      "K dimension mismatch: x.size(1) != w.size(1)");
+  TORCH_CHECK(
+      w_int8.size(0) == scale.size(0),
+      "Output dim mismatch: w.size(0) != scale.size(0)");

  // --- Determine shapes ---
-  auto B = x.size(0);  // batch size
-  auto N = w_int8.size(0);  // output dim
+  auto B = x.size(0); // batch size
+  auto N = w_int8.size(0); // output dim

  // Ensure inputs are in the correct types for the kernel
  auto x_f32 = x.to(at::kFloat);
@ -63,12 +83,13 @@ at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int
  auto scale_f32 = scale.to(at::kFloat);

  // --- Allocate output ---
-  auto out = at::empty({B, N}, x.options().dtype(at::kFloat));
+  auto out = at::empty({B, N}, x_f32.options());

  // --- Launch kernel ---
-  launch_weight_int8pack_mm_cuda_kernel(x_f32, w_int8_contiguous, scale_f32, out);
+  launch_weight_int8pack_mm_cuda_kernel(
+      x_f32, w_int8_contiguous, scale_f32, out);

-  return out;
+  return out.to(x.dtype());
 }

 } // namespace at::native
--- a/aten/src/ATen/native/cudnn/MHA.cpp
+++ b/aten/src/ATen/native/cudnn/MHA.cpp
@ -482,7 +482,9 @@ auto build_graph(
  auto scaled_dot_product_flash_attention_options =
      fe::graph::SDPA_attributes()
          .set_name("CUDNN_SDPA")
-          .set_generate_stats(return_softmaxstats)
+          .set_is_inference(return_softmaxstats == false)
+          // TODO(eqy): switch to this API once cuDNN FE is upgraded
+          // .set_generate_stats(return_softmaxstats)
          .set_causal_mask(is_causal)
          .set_attn_scale(attn_scale);
  if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
@ -702,7 +704,9 @@ auto build_graph_nestedtensor(
  auto scaled_dot_product_flash_attention_options =
      fe::graph::SDPA_attributes()
          .set_name("CUDNN_SDPA_NESTEDTENSOR")
-          .set_generate_stats(return_softmaxstats)
+          .set_is_inference(return_softmaxstats == false)
+          // TODO(eqy): switch to this API once cuDNN FE is upgraded
+          // .set_generate_stats(return_softmaxstats)
          .set_causal_mask(is_causal)
          .set_attn_scale(attn_scale)
          .set_seq_len_q(SEQ_LEN_Q_)
--- a/aten/src/ATen/native/im2col_shape_check.h
+++ b/aten/src/ATen/native/im2col_shape_check.h
@ -2,6 +2,7 @@
 #include <ATen/core/Tensor.h>
 #include <ATen/TensorUtils.h>
 #include <ATen/div_rtn.h>
+#include <c10/util/safe_numerics.h>

 namespace at::native {

@ -54,6 +55,14 @@ inline void col2im_shape_check(

  int64_t batch_dim = (ndim == 3) ? 0 : -1;
  int64_t n_input_plane = input.size(batch_dim + 1);
+  uint64_t prod_kernel_size = 1;
+
+  TORCH_CHECK(!c10::mul_overflows(static_cast<uint64_t>(kernel_width), static_cast<uint64_t>(kernel_height), &prod_kernel_size),
+            "Given kernel_width = ",
+            kernel_width,
+            " and kernel_height = ",
+            kernel_height,
+            " the product of kernel_width and kernel_height overflowed.");

  if (n_input_plane % (kernel_width * kernel_height) != 0) {
    TORCH_CHECK(false,
--- a/aten/src/ATen/native/layer_norm.h
+++ b/aten/src/ATen/native/layer_norm.h
@ -35,7 +35,7 @@ C10_ALWAYS_INLINE void _check_rms_norm_inputs_symint(
    std::stringstream ss;
    ss << "Given normalized_shape=" << normalized_shape
       << ", expected input with shape [*";
-    for (auto size : normalized_shape) {
+    for (const auto& size : normalized_shape) {
      ss << ", " << size;
    }
    ss << "], but got input of size" << input_shape;
--- a/aten/src/ATen/native/miopen/Conv_miopen.cpp
+++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp
@ -1770,10 +1770,12 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> miopen_depthwise_convolution_back
 // fusions
 // ---------------------------------------------------------------------

-void raw_miopen_convolution_relu_out(
+void raw_miopen_convolution_add_relu_out(
    const Tensor& output,
    const Tensor& input,
    const Tensor& weight,
+    const Tensor& z,
+    float alpha,
    const Tensor& bias,
    IntArrayRef stride,
    IntArrayRef padding,
@ -1781,68 +1783,20 @@ void raw_miopen_convolution_relu_out(
    int64_t groups,
    bool benchmark,
    bool deterministic) {
-  auto dataType = getMiopenDataType(input);
-  miopenConvolutionMode_t c_mode = miopenConvolution;
-  ConvolutionArgs args{ input, output, weight };
-  args.handle = getMiopenHandle();
-  at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight);
-  setConvolutionParams(
-      &args.params,
-      args.handle,
+  raw_miopen_convolution_forward_out(
+      output,
      input,
      weight,
      padding,
      stride,
      dilation,
      groups,
-      deterministic,
-      memory_format);
-  args.idesc.set(input, memory_format);
-  args.wdesc.set(weight, memory_format, 0);
-  args.odesc.set(output, memory_format);
-  args.cdesc.set(
-      dataType,
-      c_mode,
-      input.dim() - 2,
-      args.params.padding,
-      args.params.stride,
-      args.params.dilation,
-      args.params.groups,
      benchmark,
      deterministic);
-
-  TensorDescriptor bdesc;
-  bdesc.set(bias.expand({1, bias.size(0)}), output.dim());
-
-  // Create the fusion plan
-  miopenFusionPlanDescriptor_t fusePlanDesc;
-  miopenFusionOpDescriptor_t convoOp;
-  miopenFusionOpDescriptor_t biasOp;
-  miopenFusionOpDescriptor_t activOp;
-  MIOPEN_CHECK(miopenCreateFusionPlan(&fusePlanDesc, miopenVerticalFusion, args.idesc.desc()));
-  MIOPEN_CHECK(miopenCreateOpConvForward(fusePlanDesc, &convoOp, args.cdesc.desc(), args.wdesc.desc()));
-  MIOPEN_CHECK(miopenCreateOpBiasForward(fusePlanDesc, &biasOp, bdesc.desc()));
-  MIOPEN_CHECK(miopenCreateOpActivationForward(fusePlanDesc, &activOp, miopenActivationRELU));
-
-  // compile fusion plan
-  MIOPEN_CHECK(miopenCompileFusionPlan(args.handle, fusePlanDesc));
-
-  // Set the Args
-  float alpha = static_cast<float>(1);
-  float beta = static_cast<float>(0);
-  float activ_alpha = static_cast<float>(0);
-  float activ_beta = static_cast<float>(0);
-  float activ_gamma = static_cast<float>(0);
-  miopenOperatorArgs_t fusionArgs;
-  MIOPEN_CHECK(miopenCreateOperatorArgs(&fusionArgs));
-  MIOPEN_CHECK(miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weight.const_data_ptr()));
-  MIOPEN_CHECK(miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.const_data_ptr()));
-  MIOPEN_CHECK(miopenSetOpArgsActivForward(fusionArgs, activOp, &alpha, &beta, activ_alpha, activ_beta, activ_gamma));
-
-  miopenExecuteFusionPlan(args.handle, fusePlanDesc, args.idesc.desc(), input.const_data_ptr(), args.odesc.desc(), output.data_ptr(), fusionArgs);
-
-  // Cleanup
-  miopenDestroyFusionPlan(fusePlanDesc);
+  at::Tensor alpha_mul_z_add_bias =
+      at::native::reshape_bias(input.dim(), bias).add(z, alpha);
+  output.add_(alpha_mul_z_add_bias);
+  output.relu_();
 }

 static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat memory_format) {
@ -1855,171 +1809,107 @@ static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat m
 Tensor miopen_convolution_add_relu(
    const Tensor& input_t,
    const Tensor& weight_t,
-    const Tensor& z,
+    const Tensor& z_t,
    const std::optional<Scalar>& alpha,
-    const std::optional<Tensor>& bias,
+    const std::optional<Tensor>& bias_t,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    int64_t groups) {
-
-  // MIOpen does not support fusion of add, the alpha2 * z step of the below cuDNN function:
-  // y = act ( alpha1 * conv(x) + alpha2 * z + bias )
-
  auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
+  const Tensor input = input_t.contiguous(memory_format);
+  const Tensor weight = weight_t.contiguous(memory_format);
+  Tensor z = z_t;
+  if (z.suggest_memory_format() != memory_format) {
+    z = z.to(memory_format);
+  }
+  z = z.contiguous(memory_format);
+
+  // FuseFrozenConvAddRelu performs some tensor shape checking
+  Tensor output_t = at::detail::empty_cuda(
+      conv_output_size(
+          input.sizes(), weight.sizes(), padding, stride, dilation),
+      input.options().memory_format(memory_format));
+  if (output_t.numel() == 0) {
+    return output_t;
+  }

  auto& ctx = at::globalContext();
  bool benchmark = ctx.benchmarkCuDNN();
+  auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
+  auto _bias = bias_t.has_value()
+      ? bias_t.value()
+      : at::zeros(
+            {output_t.size(1)},
+            optTypeMetaToScalarType(output_t.options().dtype_opt()),
+            output_t.options().layout_opt(),
+            output_t.options().device_opt(),
+            output_t.options().pinned_memory_opt());

-  TensorArg input  { input_t,  "input",  1 },
-            weight { weight_t, "weight", 2 };
-
-  Tensor output_t = at::detail::empty_cuda(
-      conv_output_size(
-        input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
-      input_t.options().memory_format(memory_format));
-  if (output_t.numel() == 0){
-    return output_t;
-  }
-  // Avoid ambiguity of "output" when this is being used as backwards
-  TensorArg output{output_t, "result", 0};
-  miopen_convolution_forward_out(
-      output,
-      "miopen_convolution_add_relu",
+  raw_miopen_convolution_add_relu_out(
+      output_t,
      input,
      weight,
-      padding,
+      z,
+      _alpha,
+      _bias,
      stride,
+      padding,
      dilation,
      groups,
      benchmark,
-      false // deterministic
-  );
+      true); // deterministic

-  auto contig_output_t = self_or_new_memory_format(output_t, memory_format);
-
-  if (!output_t.is_same(contig_output_t)) {
-    contig_output_t.copy_(output_t);
-  }
-
-  auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
-  auto _bias = bias.has_value()
-          ? bias.value()
-          : at::zeros(
-                {contig_output_t.size(1)},
-                optTypeMetaToScalarType(contig_output_t.options().dtype_opt()),
-                contig_output_t.options().layout_opt(),
-                contig_output_t.options().device_opt(),
-                contig_output_t.options().pinned_memory_opt());
-
-  at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input_t.dim(), _bias).add(z, _alpha);
-  contig_output_t.add_(alpha_mul_z_add_bias);
-  contig_output_t.relu_();
-
-  return contig_output_t;
+  return output_t;
 }

 Tensor miopen_convolution_relu(
    const Tensor& input_t,
    const Tensor& weight_t,
-    const std::optional<Tensor>& bias,
+    const std::optional<Tensor>& bias_t,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    int64_t groups) {
+  auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
+  const Tensor input = input_t.contiguous(memory_format);
+  const Tensor weight = weight_t.contiguous(memory_format);
+
+  // FuseFrozenConvAddRelu performs some tensor shape checking
+  Tensor output_t = at::detail::empty_cuda(
+      conv_output_size(
+          input.sizes(), weight.sizes(), padding, stride, dilation),
+      input.options().memory_format(memory_format));
+  if (output_t.numel() == 0) {
+    return output_t;
+  }

  auto& ctx = at::globalContext();
  bool benchmark = ctx.benchmarkCuDNN();
+  auto _bias = bias_t.has_value()
+      ? bias_t.value()
+      : at::zeros(
+            {output_t.size(1)},
+            optTypeMetaToScalarType(output_t.options().dtype_opt()),
+            output_t.options().layout_opt(),
+            output_t.options().device_opt(),
+            output_t.options().pinned_memory_opt());

-  // MIOpen currently only supports MemoryFormat::Contiguous and fp32 and 2d
-  if (input_t.suggest_memory_format() == at::MemoryFormat::Contiguous
-          && input_t.scalar_type() == at::kFloat
-          && input_t.ndimension() == 4) {
+  raw_miopen_convolution_add_relu_out(
+      output_t,
+      input,
+      weight,
+      output_t, // use output_t as z to satisfy MIOpen API
+      0, // alpha
+      _bias,
+      stride,
+      padding,
+      dilation,
+      groups,
+      benchmark, // benchmark
+      true); // deterministic

-    // FuseFrozenConvAddRelu performs some tensor shape checking
-    Tensor output_t = at::detail::empty_cuda(
-        conv_output_size(
-            input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
-        input_t.options().memory_format(input_t.suggest_memory_format()));
-    if (output_t.numel() == 0) {
-      return output_t;
-    }
-
-    auto _bias = bias.has_value()
-            ? bias.value()
-            : at::zeros(
-                  {output_t.size(1)},
-                  optTypeMetaToScalarType(output_t.options().dtype_opt()),
-                  output_t.options().layout_opt(),
-                  output_t.options().device_opt(),
-                  output_t.options().pinned_memory_opt());
-
-    raw_miopen_convolution_relu_out(
-        output_t,
-        input_t,
-        weight_t,
-        _bias,
-        stride,
-        padding,
-        dilation,
-        groups,
-        benchmark, // benchmark
-        false // deterministic
-    );
-
-    return output_t;
-  }
-  else {
-    // fallback
-
-    auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
-
-    TensorArg input  { input_t,  "input",  1 },
-              weight { weight_t, "weight", 2 };
-
-    Tensor output_t = at::detail::empty_cuda(
-        conv_output_size(
-          input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
-        input->options().memory_format(memory_format));
-    if (output_t.numel() == 0){
-      return output_t;
-    }
-    // Avoid ambiguity of "output" when this is being used as backwards
-    TensorArg output{output_t, "result", 0};
-    miopen_convolution_forward_out(
-        output,
-        "miopen_convolution_relu",
-        input,
-        weight,
-        padding,
-        stride,
-        dilation,
-        groups,
-        benchmark,
-        false // deterministic
-    );
-
-    auto contig_output_t = self_or_new_memory_format(output_t, memory_format);
-
-    if (!output_t.is_same(contig_output_t)) {
-      contig_output_t.copy_(output_t);
-    }
-
-    auto _bias = bias.has_value()
-            ? bias.value()
-            : at::zeros(
-                  {contig_output_t.size(1)},
-                  optTypeMetaToScalarType(contig_output_t.options().dtype_opt()),
-                  contig_output_t.options().layout_opt(),
-                  contig_output_t.options().device_opt(),
-                  contig_output_t.options().pinned_memory_opt());
-
-    at::Tensor reshaped_bias = at::native::reshape_bias(input_t.dim(), _bias);
-    contig_output_t.add_(reshaped_bias);
-    contig_output_t.relu_();
-
-    return contig_output_t;
-  }
+  return output_t;
 }

 REGISTER_CUDA_DISPATCH(miopen_convolution_backward_stub, &miopen_convolution_backward)
--- a/aten/src/ATen/native/mkldnn/xpu/Blas.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/Blas.cpp
@ -559,4 +559,60 @@ Tensor _int_mm_xpu(const Tensor& self, const Tensor& mat2) {
      at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt));
  return _int_mm_out_xpu(self, mat2, result);
 }
+
+// Weight-only int8 matmul entry point for XPU.
+//
+// Validates the inputs, allocates an [M, N] result with the same options as
+// `A`, and forwards the work to at::native::onednn::quantized_matmul with
+// `scales` passed as the weight scales, undefined bias / weight zero points,
+// and no binary or unary post-op.
+//
+// A:      2-D float32 / float16 / bfloat16 tensor, unit stride in dim 1.
+// B:      2-D contiguous int8 tensor with B.size(1) == A.size(1).
+// scales: 1-D tensor with B.size(0) entries.
+// Returns the [A.size(0), B.size(0)] result tensor.
+// Raises (via TORCH_CHECK) when any of the above constraints is violated.
+Tensor _weight_int8pack_mm_xpu(
+    const Tensor& A,
+    const Tensor& B,
+    const Tensor& scales) {
+  // Validate ranks before reading any sizes so that a malformed input hits
+  // the descriptive checks below instead of an out-of-range size() access.
+  TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor.");
+  TORCH_CHECK(B.dim() == 2, __func__, " : expect B to be 2D tensor.");
+
+  const auto M = A.size(0);
+  const auto N = B.size(0);
+  const auto K = A.size(1);
+
+  TORCH_CHECK(
+      A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat,
+      __func__,
+      " : expect A to be either 32-bit or 16-bit float tensor.");
+  TORCH_CHECK(
+      A.stride(1) == 1,
+      __func__,
+      " : A must be contiguous on the last dimension.");
+  TORCH_CHECK(B.dtype() == kChar, __func__, " : expect B to be int8 tensor.");
+  TORCH_CHECK(B.is_contiguous(), __func__, " : expect B to be contiguous.");
+  TORCH_CHECK(B.size(1) == K, __func__, " : expect B.size(1) == ", K);
+
+  TORCH_CHECK(
+      scales.dim() == 1 && scales.size(0) == N,
+      __func__,
+      " : expect scales to be 1d tensor with size ",
+      N);
+
+  auto C = at::empty({M, N}, A.options());
+
+  // --- Launch kernel ---
+  // Undefined tensors stand in for the optional bias and weight zero points.
+  Tensor bias = at::Tensor();
+  Tensor mat2_zero_points = at::Tensor();
+  // Local non-const handle to `scales` for the call below.
+  Tensor non_const_scales = scales;
+  auto post_op_args = torch::List<std::optional<at::Scalar>>();
+
+  at::native::onednn::quantized_matmul(
+      A.contiguous(),
+      1.0,
+      0,
+      B,
+      non_const_scales,
+      mat2_zero_points,
+      bias,
+      C,
+      1.0,
+      0,
+      C.scalar_type(),
+      /*other*/ std::nullopt,
+      /*other scale*/ 1.0,
+      /*other zp*/ 0,
+      /*binary post op*/ "none",
+      /*binary alpha*/ 1.0,
+      /*post_op_name*/ "none",
+      post_op_args,
+      /*post_op_algorithm*/ "none",
+      /*m2_trans*/ false);
+
+  return C;
+}
 } // namespace at::native
--- a/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp
@ -110,8 +110,9 @@ void quantized_matmul(
  // [Note] Quantized Matrix Multiplication at XPU
  // The following code integrates oneDNN quantized gemm. The quantization
  // config we support:
-  // activation: s8&u8; per tensor calibrated; symmetric&asymmetric
-  // weight: s8; per_tensor/per_channel calibrated; symmetric
+  // activation: s8, u8, fp16, bf16, fp32; per tensor calibrated;
+  //   symmetric&asymmetric
+  // weight: s8; per_tensor/per_channel calibrated; symmetric
  auto attr = Attr(static_cast<float>(1.0 / output_scale), output_zero_point);
  construct_attr_by_post_op(
      binary_post_op,
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@ -568,7 +568,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
    MPSShape* mpsStrides = getMPSShape(_tensor.strides());
    check_mps_shape(mpsShape);

-    auto storage_numel = src.storage().nbytes() / src.element_size();
+    auto storage_numel = src.storage().nbytes() / src.element_size() - src.storage_offset();
    TORCH_CHECK(storage_numel <= std::numeric_limits<int32_t>::max(),
                "MPSGaph does not support tensor dims larger than INT_MAX");
    MPSNDArrayDescriptor* srcTensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:dataType
--- a/aten/src/ATen/native/mps/kernels/EmbeddingBag.h
+++ b/aten/src/ATen/native/mps/kernels/EmbeddingBag.h
@ -0,0 +1,25 @@
+#pragma once
+#include <c10/metal/common.h>
+
+#ifdef __METAL__
+enum class EmbeddingBagMode { SUM = 0, MEAN, MAX };
+#else
+#include <ATen/native/EmbeddingBag.h>
+using at::native::EmbeddingBagMode;
+#endif
+
+// Parameter block consumed by the embedding_bag kernel in EmbeddingBag.metal.
+// This header is included both by that shader and by the host-side
+// EmbeddingBag.mm.
+// NOTE(review): the host code that fills the struct is not visible here; the
+// field notes below describe how the kernel reads each member.
+template <typename idx_type_t = uint32_t>
+struct EmbeddingBagParams {
+  // Strides of the weight table: [0] is multiplied by the looked-up index,
+  // [1] by the feature index.
+  ::c10::metal::array<idx_type_t, 2> weight_strides;
+  // Strides of the output: [0] is multiplied by the bag index, [1] by the
+  // feature index.
+  ::c10::metal::array<idx_type_t, 2> output_strides;
+  // Strides for the max_indices buffer. The kernel binds a reference to them
+  // but does not read through it.
+  ::c10::metal::array<idx_type_t, 2> max_indices_strides;
+
+  // Stride used to index per_sample_weights; the SUM reduction treats 0 as
+  // "no per-sample weights".
+  idx_type_t per_sample_weights_strides;
+
+  // Used as the end of the index range of the last bag.
+  idx_type_t num_indices;
+  // Number of bags; the bag with index num_bags - 1 is treated as the last.
+  idx_type_t num_bags;
+  // The flat thread id is split into bag = tid / feature_size and
+  // feature = tid % feature_size.
+  idx_type_t feature_size;
+
+  // Selects the reduction (SUM, MEAN or MAX) the kernel dispatches to.
+  EmbeddingBagMode mode;
+  // Entries of indices equal to this value are left out of the reduction.
+  int64_t padding_idx;
+};
--- a/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal
+++ b/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal
@ -0,0 +1,212 @@
+#include <ATen/native/mps/kernels/EmbeddingBag.h>
+#include <c10/metal/utils.h>
+#include <metal_array>
+#include <metal_stdlib>
+
+using namespace metal;
+using namespace c10::metal;
+
+// Produces the starting value of the per-bag accumulator. Modes without a
+// dedicated specialization start from zero.
+template <EmbeddingBagMode M, typename T>
+struct ReductionOpInit {
+  inline opmath_t<T> operator()() {
+    const opmath_t<T> zero = 0;
+    return zero;
+  }
+};
+
+// MAX starts from negative infinity so that any weight value replaces it.
+template <typename T>
+struct ReductionOpInit<EmbeddingBagMode::MAX, T> {
+  inline opmath_t<T> operator()() {
+    const auto lowest = static_cast<opmath_t<T>>(-INFINITY);
+    return lowest;
+  }
+};
+
+// Folds one weight value into the running accumulator `out_val`. The primary
+// template is only declared; each mode supplies its own specialization.
+template <EmbeddingBagMode M, typename T>
+struct ReductionOp {
+  inline opmath_t<T> operator()(
+      T weight_val,
+      opmath_t<T> out_val,
+      uint32_t per_sample_weights_index,
+      constant T* per_sample_weights,
+      uint32_t per_sample_weights_strides);
+};
+
+// SUM: adds the weight value, scaled by its per-sample weight when a non-zero
+// per_sample_weights stride is given.
+template <typename T>
+struct ReductionOp<EmbeddingBagMode::SUM, T> {
+  inline opmath_t<T> operator()(
+      T weight_val,
+      opmath_t<T> out_val,
+      uint32_t per_sample_weights_index,
+      constant T* per_sample_weights,
+      uint32_t per_sample_weights_strides) {
+    // A zero stride means there are no per-sample weights to apply.
+    if (per_sample_weights_strides == 0) {
+      return static_cast<opmath_t<T>>(weight_val) + out_val;
+    }
+
+    const T sample_scale = per_sample_weights
+        [per_sample_weights_strides * per_sample_weights_index];
+    return static_cast<opmath_t<T>>(sample_scale) *
+        static_cast<opmath_t<T>>(weight_val) +
+        out_val;
+  }
+};
+
+// MEAN: accumulates a plain sum; the division happens in ReductionOpFinal.
+template <typename T>
+struct ReductionOp<EmbeddingBagMode::MEAN, T> {
+  inline opmath_t<T> operator()(
+      T weight_val,
+      opmath_t<T> out_val,
+      uint32_t,
+      constant T*,
+      uint32_t) {
+    const auto widened = static_cast<opmath_t<T>>(weight_val);
+    return widened + out_val;
+  }
+};
+
+// MAX: keeps the larger of the weight value and the accumulator.
+template <typename T>
+struct ReductionOp<EmbeddingBagMode::MAX, T> {
+  inline opmath_t<T> operator()(
+      T weight_val,
+      opmath_t<T> out_val,
+      uint32_t,
+      constant T*,
+      uint32_t) {
+    const auto candidate = static_cast<opmath_t<T>>(weight_val);
+    return max(candidate, out_val);
+  }
+};
+
+// Converts the accumulator into the stored output value. `count` is the
+// number of entries that contributed to the bag. The generic case only
+// narrows the accumulator to T.
+template <EmbeddingBagMode M, typename T>
+struct ReductionOpFinal {
+  inline T operator()(opmath_t<T> val, uint32_t) {
+    const T narrowed = static_cast<T>(val);
+    return narrowed;
+  }
+};
+
+// MEAN: divides the accumulated sum by `count`; a bag with no contributing
+// entries yields 0.
+template <typename T>
+struct ReductionOpFinal<EmbeddingBagMode::MEAN, T> {
+  inline T operator()(opmath_t<T> val, uint32_t count) {
+    if (count == 0) {
+      return static_cast<T>(0);
+    }
+    return static_cast<T>(val / count);
+  }
+};
+
+// MAX: a bag with no contributing entries yields 0 instead of the initial
+// accumulator value.
+template <typename T>
+struct ReductionOpFinal<EmbeddingBagMode::MAX, T> {
+  inline T operator()(opmath_t<T> val, uint32_t count) {
+    if (count == 0) {
+      return static_cast<T>(0);
+    }
+    return static_cast<T>(val);
+  }
+};
+
+// Computes one output element of embedding_bag for reduction mode M.
+//
+// The flat thread id `tid` is split into a bag index and a feature index; the
+// thread reduces that feature over every index belonging to the bag and
+// writes a single value to `output`.
+//
+// `offset2bag`, `bag_size`, `max_indices` and `params.max_indices_strides`
+// are accepted but not read or written by this body.
+// NOTE(review): there is no bounds check on `tid`; presumably the kernel is
+// launched with exactly num_bags * feature_size threads - confirm against the
+// host code.
+template <EmbeddingBagMode M, typename T, typename I>
+void embedding_bag_impl(
+    constant T* weight,
+    constant I* indices,
+    constant I* offsets,
+    constant T* per_sample_weights,
+    device T* output,
+    device I* offset2bag,
+    device I* bag_size,
+    device I* max_indices,
+    constant EmbeddingBagParams<uint32_t>& params,
+    uint tid) {
+  auto num_indices = params.num_indices;
+  auto num_bags = params.num_bags;
+  auto feature_size = params.feature_size;
+  auto padding_idx = params.padding_idx;
+  auto per_sample_weights_strides = params.per_sample_weights_strides;
+  constant auto& output_strides = params.output_strides;
+  constant auto& weight_strides = params.weight_strides;
+  constant auto& max_indices_strides = params.max_indices_strides;
+
+  // Each thread handles one (bag, feature) pair.
+  auto bag_idx = tid / feature_size;
+  auto feature_idx = tid % feature_size;
+
+  // Point `output` at the element this thread is responsible for.
+  output += bag_idx * output_strides[0] + feature_idx * output_strides[1];
+
+  // The bag covers indices[offsets[bag_idx] .. next offset). The last bag has
+  // no next offset and ends at num_indices instead. `offsets_end` is clamped
+  // to num_bags - 1 because offsets[offsets_end] is evaluated for every bag,
+  // including the last one, where its value is multiplied by zero.
+  uint32_t offsets_end = min(bag_idx + 1, num_bags - 1);
+  bool is_last_bag = bag_idx + 1 == num_bags;
+  uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]);
+  uint32_t indices_end = is_last_bag * (num_indices) +
+      (!is_last_bag) * (static_cast<uint32_t>(offsets[offsets_end]));
+
+  auto out_val = ReductionOpInit<M, T>()();
+
+  // Number of non-padding entries seen in this bag.
+  uint32_t bag_size_ = 0;
+
+  for (uint32_t indices_idx = indices_start; indices_idx < indices_end;
+       indices_idx++) {
+    I weight_idx = indices[indices_idx];
+    bool pad = (weight_idx == padding_idx);
+    // The weight is loaded even for padding entries; its contribution is
+    // dropped by the select on `pad` below.
+    T weight_val = weight
+        [static_cast<uint32_t>(weight_idx) * weight_strides[0] +
+         feature_idx * weight_strides[1]];
+
+    bag_size_ += static_cast<uint32_t>(!pad);
+
+    auto tmp_val = ReductionOp<M, T>()(
+        weight_val,
+        out_val,
+        indices_idx,
+        per_sample_weights,
+        per_sample_weights_strides);
+
+    // Keep the previous accumulator when the entry is padding.
+    out_val = pad ? out_val : tmp_val;
+  }
+
+  *output = ReductionOpFinal<M, T>()(out_val, bag_size_);
+}
+
+// Expands to a `return` statement that forwards all kernel arguments to
+// embedding_bag_impl instantiated for reduction mode MODE. It relies on the
+// argument names used by the embedding_bag kernel below.
+#define DISPATCH_IMPL(MODE)        \
+  return embedding_bag_impl<MODE>( \
+      weight,                      \
+      indices,                     \
+      offsets,                     \
+      per_sample_weights,          \
+      output,                      \
+      offset2bag,                  \
+      bag_size,                    \
+      max_indices,                 \
+      params,                      \
+      tid)
+
+// Kernel entry point: selects the embedding_bag_impl instantiation that
+// matches the runtime value of params.mode. T is the weight/output element
+// type and I is the index/offset element type. Buffers 0-7 carry the tensor
+// data and buffer 8 carries the EmbeddingBagParams block.
+template <typename T, typename I>
+kernel void embedding_bag(
+    constant T* weight [[buffer(0)]],
+    constant I* indices [[buffer(1)]],
+    constant I* offsets [[buffer(2)]],
+    constant T* per_sample_weights [[buffer(3)]],
+    device T* output [[buffer(4)]],
+    device I* offset2bag [[buffer(5)]],
+    device I* bag_size [[buffer(6)]],
+    device I* max_indices [[buffer(7)]],
+    constant EmbeddingBagParams<uint32_t>& params [[buffer(8)]],
+    uint tid [[thread_position_in_grid]]) {
+  // Every case returns through DISPATCH_IMPL, so there is no fallthrough.
+  switch (params.mode) {
+    case EmbeddingBagMode::SUM:
+      DISPATCH_IMPL(EmbeddingBagMode::SUM);
+    case EmbeddingBagMode::MEAN:
+      DISPATCH_IMPL(EmbeddingBagMode::MEAN);
+    case EmbeddingBagMode::MAX:
+      DISPATCH_IMPL(EmbeddingBagMode::MAX);
+  }
+}
+
+// Explicitly instantiates the embedding_bag kernel for element type T and
+// index type I, and exports it under the host-visible name
+// "embedding_bag_<T>_<I>" (for example "embedding_bag_float_int").
+#define REGISTER_EMBEDDING_BAG_OP(T, I)                             \
+  template [[host_name("embedding_bag_" #T "_" #I)]]                \
+  kernel void embedding_bag<T, I>(                                  \
+      constant T * weight [[buffer(0)]],                            \
+      constant I * indices [[buffer(1)]],                           \
+      constant I * offsets [[buffer(2)]],                           \
+      constant T * per_sample_weights [[buffer(3)]],                \
+      device T * output [[buffer(4)]],                              \
+      device I * offset2bag [[buffer(5)]],                          \
+      device I * bag_size [[buffer(6)]],                            \
+      device I * max_indices [[buffer(7)]],                         \
+      constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \
+      uint tid [[thread_position_in_grid]]);
+
+// Instantiations: float / half / bfloat weights, each with int and long
+// indices.
+REGISTER_EMBEDDING_BAG_OP(float, int);
+REGISTER_EMBEDDING_BAG_OP(float, long);
+REGISTER_EMBEDDING_BAG_OP(half, int);
+REGISTER_EMBEDDING_BAG_OP(half, long);
+REGISTER_EMBEDDING_BAG_OP(bfloat, int);
+REGISTER_EMBEDDING_BAG_OP(bfloat, long);
--- a/aten/src/ATen/native/mps/operations/EmbeddingBag.mm
+++ b/aten/src/ATen/native/mps/operations/EmbeddingBag.mm
@ -0,0 +1,179 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/TensorUtils.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/mps/MPSProfiler.h>
+#include <ATen/native/EmbeddingBag.h>
+#include <ATen/native/Pool.h>
+#include <ATen/native/mps/OperationUtils.h>
+#include <ATen/native/mps/kernels/EmbeddingBag.h>
+
+#include <fmt/format.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_embedding_bag_forward_only_native.h>
+#include <ATen/ops/_embedding_bag_native.h>
+#include <ATen/ops/empty.h>
+#endif
+
+namespace at::native {
+
+#ifndef PYTORCH_JIT_COMPILE_SHADERS
+static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
+#else
+#include <ATen/native/mps/EmbeddingBag_metallib.h>
+#endif
+
+namespace {
+
+// Returns {indices, offsets} with both tensors brought to the dtype that
+// promoteTypes() yields for their two scalar types. A tensor that already has
+// that dtype is passed through without a conversion call.
+std::pair<Tensor, Tensor> promoteIndicesAndOffsets(const Tensor& indices, const Tensor& offsets) {
+  const auto promoted = promoteTypes(offsets.scalar_type(), indices.scalar_type());
+  const auto cast_if_needed = [promoted](const Tensor& t) -> Tensor {
+    if (t.scalar_type() == promoted) {
+      return t;
+    }
+    return t.toType(promoted);
+  };
+  return {cast_if_needed(indices), cast_if_needed(offsets)};
+}
+
+} // namespace
+
+namespace mps {
+
+// Forward pass of embedding_bag on MPS.
+//
+// Validates the inputs, promotes `indices_`/`offsets_` to a common dtype,
+// allocates the four result tensors and launches the Metal kernel named
+// "embedding_bag_<weight dtype>_<index dtype>" with output.numel() threads.
+//
+// Parameters:
+//   weight                 2-D tensor; weight.size(1) is the feature size.
+//   indices_               1-D tensor; must be int or long after promotion.
+//   offsets_               1-D tensor; must be int or long after promotion.
+//   scale_grad_by_freq     not read by this implementation.
+//   mode                   cast to EmbeddingBagMode and passed to the kernel.
+//   sparse                 not read by this implementation.
+//   per_sample_weights_opt bound to the kernel only when present and defined.
+//   include_last_offset    when true, the bag count is offsets.size(0) - 1.
+//   padding_idx            passed to the kernel unchanged.
+//
+// Returns (output, offset2bag, bag_size, max_indices). `max_indices` has shape
+// {num_bags, feature_size} in MAX mode and is a zero-length placeholder
+// otherwise.
+//
+// Fails via TORCH_CHECK on wrong input ranks, unsupported index dtypes, or an
+// empty `offsets` combined with `include_last_offset`. Sizes and strides that
+// are copied into the 32-bit kernel parameters go through safe_downcast.
+static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
+    const Tensor& weight,
+    const Tensor& indices_,
+    const Tensor& offsets_,
+    const bool scale_grad_by_freq,
+    const int64_t mode,
+    bool sparse,
+    const std::optional<Tensor>& per_sample_weights_opt,
+    bool include_last_offset,
+    int64_t padding_idx) {
+  TORCH_CHECK(indices_.dim() == 1, "input has to be a 1D Tensor, but got Tensor of dimension ", indices_.dim());
+  TORCH_CHECK(offsets_.dim() == 1, "offsets has to be a 1D Tensor, but got Tensor of dimension ", offsets_.dim());
+  TORCH_CHECK(weight.dim() == 2, "weight has to be a 2D Tensor, but got Tensor of dimension ", weight.dim());
+
+  // Plain variables (not structured bindings) because they are referenced from
+  // inside the dispatch block further down.
+  Tensor indices, offsets;
+  std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_);
+  auto indices_arg = TensorArg(indices, "indices", 1);
+  checkScalarTypes("embedding_bag_mps", indices_arg, {kLong, kInt});
+  auto offsets_arg = TensorArg(offsets, "offsets", 1);
+  checkScalarTypes("embedding_bag_mps", offsets_arg, {kLong, kInt});
+  checkSameType("embedding_bag_mps", indices_arg, offsets_arg);
+
+  const int64_t num_indices = indices.size(0);
+  int64_t num_bags = offsets.size(0);
+  if (include_last_offset) {
+    // Without this guard an empty `offsets` would produce num_bags == -1 and
+    // fail later with an unrelated allocation error.
+    TORCH_CHECK(num_bags >= 1, "include_last_offset: number of offsets should be at least 1");
+    num_bags -= 1;
+  }
+  const int64_t feature_size = weight.size(1);
+
+  auto bag_size = at::empty(offsets.sizes(), indices.options());
+  auto offset2bag = at::empty({indices.size(0)}, indices.options());
+  auto output = at::empty({num_bags, feature_size}, weight.options());
+
+  Tensor max_indices;
+
+  if (mode == EmbeddingBagMode::MAX) {
+    max_indices = at::empty({num_bags, feature_size}, indices.options());
+  } else {
+    // Placeholder so that a valid buffer can still be bound to the kernel.
+    max_indices = at::empty({0}, indices.options());
+  }
+
+  EmbeddingBagParams<uint32_t> params;
+
+  for (const auto dim : c10::irange(weight.dim())) {
+    params.weight_strides[dim] = safe_downcast<uint32_t, int64_t>(weight.stride(dim));
+    params.output_strides[dim] = safe_downcast<uint32_t, int64_t>(output.stride(dim));
+
+    // max_indices is only 2-D in MAX mode; its strides are left untouched otherwise.
+    if (mode == EmbeddingBagMode::MAX) {
+      params.max_indices_strides[dim] = safe_downcast<uint32_t, int64_t>(max_indices.stride(dim));
+    }
+  }
+
+  bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
+  // Checked narrowing, matching the stride arrays above, instead of a silent
+  // int64 -> 32-bit truncation.
+  params.per_sample_weights_strides =
+      use_per_sample_weights ? safe_downcast<uint32_t, int64_t>(per_sample_weights_opt->stride(0)) : 0;
+
+  params.num_indices = safe_downcast<uint32_t, int64_t>(num_indices);
+  params.num_bags = safe_downcast<uint32_t, int64_t>(num_bags);
+  params.feature_size = safe_downcast<uint32_t, int64_t>(feature_size);
+  params.mode = static_cast<EmbeddingBagMode>(mode);
+  params.padding_idx = padding_idx;
+
+  auto num_threads = output.numel();
+  MPSStream* stream = getCurrentMPSStream();
+
+  dispatch_sync_with_rethrow(stream->queue(), ^() {
+    @autoreleasepool {
+      id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
+      // The name must match the host_name given to the kernel instantiation
+      // for this (weight dtype, index dtype) pair.
+      auto pipeline_state = lib.getPipelineStateForFunc(
+          fmt::format("embedding_bag_{}_{}", scalarToMetalTypeString(weight), scalarToMetalTypeString(indices)));
+
+      getMPSProfiler().beginProfileKernel(pipeline_state, "embedding_bag", {weight, indices, offsets});
+      [computeEncoder setComputePipelineState:pipeline_state];
+      // Argument order corresponds to kernel buffer slots 0..8.
+      mtl_setArgs(computeEncoder,
+                  weight,
+                  indices,
+                  offsets,
+                  use_per_sample_weights ? per_sample_weights_opt : std::nullopt,
+                  output,
+                  offset2bag,
+                  bag_size,
+                  max_indices,
+                  params);
+
+      mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
+      getMPSProfiler().endProfileKernel(pipeline_state);
+    }
+  });
+
+  return std::tuple<Tensor, Tensor, Tensor, Tensor>(
+      std::move(output), std::move(offset2bag), std::move(bag_size), std::move(max_indices));
+}
+
+} // namespace mps
+
+// MPS entry point for _embedding_bag. It performs no work of its own: every
+// argument is handed on, unchanged and in the same order, to
+// mps::_embedding_bag_mps_impl, whose result tuple is returned.
+std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps(
+    const Tensor& weight,
+    const Tensor& indices,
+    const Tensor& offsets,
+    const bool scale_grad_by_freq,
+    const int64_t mode,
+    bool sparse,
+    const std::optional<Tensor>& per_sample_weights_opt,
+    bool include_last_offset,
+    int64_t padding_idx) {
+  auto results = mps::_embedding_bag_mps_impl(
+      weight, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights_opt, include_last_offset, padding_idx);
+  return results;
+}
+
+// MPS entry point for _embedding_bag_forward_only. It shares the code path of
+// _embedding_bag_mps: all arguments are forwarded unchanged and the resulting
+// tuple is returned as-is.
+std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_forward_only_mps(
+    const Tensor& weight,
+    const Tensor& indices,
+    const Tensor& offsets,
+    const bool scale_grad_by_freq,
+    const int64_t mode,
+    bool sparse,
+    const std::optional<Tensor>& per_sample_weights_opt,
+    bool include_last_offset,
+    int64_t padding_idx) {
+  auto results = _embedding_bag_mps(
+      weight, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights_opt, include_last_offset, padding_idx);
+  return results;
+}
+
+} // namespace at::native
--- a/aten/src/ATen/native/mps/operations/Pooling.mm
+++ b/aten/src/ATen/native/mps/operations/Pooling.mm
@ -534,6 +534,18 @@ static void max_unpool_out_mps_template(const Tensor& input,
  output.resize_(output_size, memory_format);
  output.fill_(0);

+  if (indices.defined() && indices.numel() > 0) {
+    auto output_image_size = c10::multiply_integers(output_size_);
+
+    int64_t min_idx = indices.min().item<int64_t>();
+    int64_t max_idx = indices.max().item<int64_t>();
+
+    if (min_idx < 0 || max_idx >= output_image_size) {
+      int64_t error_idx = (min_idx < 0) ? min_idx : max_idx;
+      TORCH_CHECK(false, "Found an invalid max index: ", error_idx, " for output tensor of shape ", output_size_);
+    }
+  }
+
  id<MTLDevice> device = MPSDevice::getInstance()->device();
  MPSStream* mpsStream = getCurrentMPSStream();
  const auto numThreads = input.numel();
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -2351,6 +2351,7 @@
  dispatch:
    CPU: _embedding_bag_forward_only_cpu
    CUDA: _embedding_bag_forward_only_cuda
+    MPS: _embedding_bag_forward_only_mps
  autogen: _embedding_bag_forward_only.out

 - func: _rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor)
@ -2372,6 +2373,7 @@
  dispatch:
    CPU: _embedding_bag_cpu
    CUDA: _embedding_bag_cuda
+    MPS: _embedding_bag_mps
  autogen: _embedding_bag.out
  tags: core

@ -4241,6 +4243,7 @@
    CPU: _weight_int8pack_mm_cpu
    CUDA: _weight_int8pack_mm_cuda
    MPS: _weight_int8pack_mm_mps
+    XPU: _weight_int8pack_mm_xpu

 - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
  python_module: sparse
@ -4372,7 +4375,7 @@
  variants: function, method
  dispatch:
    CPU: narrow_copy_dense_cpu
-    SparseCPU, SparseCUDA: narrow_copy_sparse
+    SparseCPU, SparseCUDA, SparseMPS: narrow_copy_sparse
    CompositeExplicitAutogradNonFunctional: narrow_copy_dense_symint
  tags: view_copy

@ -6660,7 +6663,7 @@
 - func: zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CompositeExplicitAutograd: zeros_out
-    SparseCPU, SparseCUDA, SparseMeta: zeros_sparse_out
+    SparseCPU, SparseCUDA, SparseMPS, SparseMeta: zeros_sparse_out

 - func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
  dispatch:
@ -10699,6 +10702,7 @@
  dispatch:
    CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow
    CUDA: foreach_tensor_div_list_kernel_cuda
+    MTIA: foreach_tensor_div_list_kernel_mtia

 - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
@ -10706,6 +10710,7 @@
  dispatch:
    CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_
    CUDA: foreach_tensor_div_list_kernel_cuda_
+    MTIA: foreach_tensor_div_list_kernel_mtia_
  autogen: _foreach_div.List_out

 - func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
@ -10729,6 +10734,7 @@
  dispatch:
    CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow
    CUDA: foreach_tensor_div_tensor_kernel_cuda
+    MTIA: foreach_tensor_div_tensor_kernel_mtia

 - func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
@ -10736,6 +10742,7 @@
  dispatch:
    CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_
    CUDA: foreach_tensor_div_tensor_kernel_cuda_
+    MTIA: foreach_tensor_div_tensor_kernel_mtia_
  autogen: _foreach_div.Tensor_out

 - func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
@ -10842,6 +10849,7 @@
  dispatch:
    CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
    CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
+    MTIA: foreach_tensor_maximum_scalar_kernel_mtia_
  autogen: _foreach_maximum.Scalar_out

 # foreach_minimum/maximum dispatches to clamp_max/min
--- a/aten/src/ATen/native/nested/NestedTensorBinaryOps.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorBinaryOps.cpp
@ -77,7 +77,7 @@ static Tensor NestedTensor_elementwise_Tensor(
    const Tensor& other,
    const std::string& op_name,
    bool supports_striding,
-    Func f) {
+    const Func& f) {
  Tensor self_contiguous = self;
  Tensor other_contiguous = other;
  // self is a scalar
@ -238,7 +238,7 @@ static Tensor& NestedTensor_elementwise__Tensor(
    Tensor& self,
    const Tensor& other,
    const std::string& op_name,
-    Func f) {
+    const Func& f) {
  // self is a scalar
  if (!self.is_nested() && self.dim() == 0 && self.numel() == 1) {
    auto other_impl = get_nested_tensor_impl(other);
--- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
+++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
@ -149,7 +149,7 @@ Tensor MakeStridedQTensorCPU(
    const IntArrayRef& sizes,
    const IntArrayRef& strides,
    const TensorOptions& options,
-    QuantizerPtr quantizer) {
+    const QuantizerPtr& quantizer) {
  AT_ASSERT(options.device().is_cpu());
  at::native::check_size_nonnegative(sizes);
  auto* allocator = at::getCPUAllocator();
--- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
+++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
@ -37,7 +37,7 @@ struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase {
        col_offsets(std::move(col_offsets)),
        w_scale(std::move(w_scale)),
        w_zp(std::move(w_zp)),
-        q_scheme(std::move(q_scheme)) {}
+        q_scheme(q_scheme) {}
  std::unique_ptr<fbgemm::PackBMatrix<int8_t>> w;
  std::optional<at::Tensor> bias_;
  std::vector<int32_t> col_offsets;
@ -316,7 +316,7 @@ Tensor MakeStridedQTensorCPU(
    const IntArrayRef& sizes,
    const IntArrayRef& strides,
    const TensorOptions& options,
-    QuantizerPtr quantizer);
+    const QuantizerPtr& quantizer);

 Tensor MakeEmptyAffineQuantizedChannelsLast3dTensor(
    int64_t N,
--- a/aten/src/ATen/native/quantized/cuda/FusedObsFakeQuant.cu
+++ b/aten/src/ATen/native/quantized/cuda/FusedObsFakeQuant.cu
@ -1,5 +1,6 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
 #include <ATen/ceil_div.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <c10/cuda/CUDAGuard.h>
@ -21,10 +22,11 @@
 namespace at::native {

 namespace {
+template <typename T>
 __global__ void ChooseQuantizationParamsKernelImpl(
    const int64_t* fake_quant_on,
-    const float* x_min,
-    const float* x_max,
+    const T* x_min,
+    const T* x_max,
    int32_t qmin,
    int32_t qmax,
    int size,
@ -93,34 +95,44 @@ __global__ void ChooseQuantizationParamsKernelImpl(
  }
 }

+// Device-side infinity test, overloaded per element type so that templated
+// kernels can call it without knowing T. This overload forwards the float
+// straight to ::isinf.
+__device__ inline bool isinf_device(float v) {
+  return ::isinf(v);
+}
+// BFloat16 overload: converts the value to float first, then tests it with
+// ::isinf.
+__device__ inline bool isinf_device(c10::BFloat16 v) {
+  return ::isinf(static_cast<float>(v));
+}
+
 // CUDA kernel to compute Moving Average Min/Max of the tensor.
 // It uses the running_min and running_max along with averaging const, c.
 // The formula used to compute the new min/max is as follows
 //
 // running_min = (1 - c) * running_min + c * x_min, if running_min != inf
 // running_min = x_min, if running_min == inf
+template <typename T>
 __global__ void MovingAverageMinMax(
    const int64_t* observer_on,
-    const float* x_min,
-    const float* x_max,
-    float* running_min,
-    float* running_max,
+    const T* x_min,
+    const T* x_max,
+    T* running_min,
+    T* running_max,
    const float averaging_const,
    const int size) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (*observer_on == 1) {
    if (i < size) {
-      float curr_min = x_min[i];
-      float curr_max = x_max[i];
+      T curr_min = x_min[i];
+      T curr_max = x_max[i];

-      float adjusted_min = ::isinf(running_min[i])
-          ? curr_min
-          : (running_min[i]) + averaging_const * (curr_min - (running_min[i]));
+      T averaging_const_t = static_cast<T>(averaging_const);

-      float adjusted_max = ::isinf(running_max[i])
-          ? curr_max
-          : (running_max[i]) + averaging_const * (curr_max - (running_max[i]));
+      T adjusted_min = isinf_device(running_min[i]) ? curr_min
+                                                    : (running_min[i]) +
+              averaging_const_t * (curr_min - (running_min[i]));
+
+      T adjusted_max = isinf_device(running_max[i]) ? curr_max
+                                                    : (running_max[i]) +
+              averaging_const_t * (curr_max - (running_max[i]));

      running_min[i] = adjusted_min;
      running_max[i] = adjusted_max;
@ -142,40 +154,51 @@ void _calculate_moving_average(
  at::Tensor x_min, x_max;

  int64_t* observer_on_data = observer_on.data_ptr<int64_t>();
-  float* running_min_data = running_min.data_ptr<float>();
-  float* running_max_data = running_max.data_ptr<float>();
  cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();

  if (per_row_fq) {
    std::tie(x_min, x_max) = at::aminmax(x, 1);
-    float* x_min_data = x_min.data_ptr<float>();
-    float* x_max_data = x_max.data_ptr<float>();
    int num_threads = std::min(size, (int64_t)512);
    const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
+    AT_DISPATCH_FLOATING_TYPES_AND(
+        at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
+          scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
+          scalar_t* x_max_data = x_max.data_ptr<scalar_t>();

-    // Moving Average Min/Max observer for activations
-    MovingAverageMinMax<<<num_blocks, num_threads, 0, cuda_stream>>>(
-        observer_on_data,
-        x_min_data,
-        x_max_data,
-        running_min_data,
-        running_max_data,
-        averaging_const,
-        size);
+          scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
+          scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
+
+          // Moving Average Min/Max observer for activations
+          MovingAverageMinMax<<<num_blocks, num_threads, 0, cuda_stream>>>(
+              observer_on_data,
+              x_min_data,
+              x_max_data,
+              running_min_data,
+              running_max_data,
+              averaging_const,
+              size);
+        });
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
    std::tie(x_min, x_max) = at::aminmax(x);
-    float* x_min_data = x_min.data_ptr<float>();
-    float* x_max_data = x_max.data_ptr<float>();
-    // Moving Average Min/Max observer for activations
-    MovingAverageMinMax<<<1, 1, 0, cuda_stream>>>(
-        observer_on_data,
-        x_min_data,
-        x_max_data,
-        running_min_data,
-        running_max_data,
-        averaging_const,
-        1 /*size*/);
+    AT_DISPATCH_FLOATING_TYPES_AND(
+        at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
+          scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
+          scalar_t* x_max_data = x_max.data_ptr<scalar_t>();
+
+          scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
+          scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
+
+          // Moving Average Min/Max observer for activations
+          MovingAverageMinMax<<<1, 1, 0, cuda_stream>>>(
+              observer_on_data,
+              x_min_data,
+              x_max_data,
+              running_min_data,
+              running_max_data,
+              averaging_const,
+              1 /*size*/);
+        });
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
 }
@ -198,34 +221,44 @@ void _calc_moving_avg_qparams_helper(
  cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();
  int64_t* fake_quant_on_data = fake_quant_on.data_ptr<int64_t>();
  if (per_row_fq) {
-    float* running_min_data = running_min.data_ptr<float>();
-    float* running_max_data = running_max.data_ptr<float>();
-    int num_threads = std::min(size, (int64_t)512);
-    const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
-    ChooseQuantizationParamsKernelImpl<<<num_blocks, num_threads, 0, cuda_stream>>>(
-        fake_quant_on_data,
-        running_min_data,
-        running_max_data,
-        qmin,
-        qmax,
-        size,
-        symmetric_quant,
-        scale_ptr,
-        zp_ptr);
+    AT_DISPATCH_FLOATING_TYPES_AND(
+        at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
+          scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
+          scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
+          int num_threads = std::min(size, (int64_t)512);
+          const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
+          ChooseQuantizationParamsKernelImpl<<<
+              num_blocks,
+              num_threads,
+              0,
+              cuda_stream>>>(
+              fake_quant_on_data,
+              running_min_data,
+              running_max_data,
+              qmin,
+              qmax,
+              size,
+              symmetric_quant,
+              scale_ptr,
+              zp_ptr);
+        });
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
-    float* running_min_data = running_min.data_ptr<float>();
-    float* running_max_data = running_max.data_ptr<float>();
-    ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>(
-        fake_quant_on_data,
-        running_min_data,
-        running_max_data,
-        qmin,
-        qmax,
-        1, // size
-        symmetric_quant, // preserve_sparsity
-        scale_ptr,
-        zp_ptr);
+    AT_DISPATCH_FLOATING_TYPES_AND(
+        at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
+          scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
+          scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
+          ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>(
+              fake_quant_on_data,
+              running_min_data,
+              running_max_data,
+              qmin,
+              qmax,
+              1, // size
+              symmetric_quant, // preserve_sparsity
+              scale_ptr,
+              zp_ptr);
+        });
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
 }
--- a/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp
+++ b/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp
@ -64,7 +64,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
  // create sparse descriptor, dtype
  cusparseLtMatDescriptor_t sparse_input_descriptor;
  cudaDataType type;
-  auto compression_factor = 9;

  #ifdef USE_ROCM
  TORCH_CHECK(isHipSparseLtSupported());
@ -73,7 +72,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
  switch (sparse_input.scalar_type()) {
    case at::ScalarType::Char:
      type = CUDA_R_8I;
-      compression_factor = 10;
      break;
    case at::ScalarType::Half:
      type = CUDA_R_16F;
@ -89,7 +87,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
 #if defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602 && !defined(USE_ROCM)
    case at::ScalarType::Float8_e4m3fn:
      type = CUDA_R_8F_E4M3;
-      compression_factor = 10;
      break;
 #endif
    default:
@ -97,10 +94,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
      break;
  }

-  // create a new compressed tensor with the same dtype as
-  auto compressed_tensor =
-      sparse_input.new_empty(sparse_input.numel() * compression_factor / 16);
-
  TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit(
      &handle,
      &sparse_input_descriptor,
@ -121,6 +114,15 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
      &compressed_size,
      &compressed_buffer_size));

+  // create a new compressed tensor with the same dtype as the input,
+  // and with packed data/metadata stored in an array with original
+  // number of rows, and sufficient columns to provide compressed_size
+  // buffer (in bytes)
+  size_t orig_m = sparse_input.size(0);
+  size_t div = orig_m * sparse_input.itemsize();
+  size_t new_n = (compressed_size + div - 1) / div; // ceil: round up so new_n * div >= compressed_size
+  auto compressed_tensor = sparse_input.new_empty({(int64_t)orig_m, (int64_t)new_n});
+
  auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
  auto compressedBufferPtr = allocator.allocate(compressed_buffer_size);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -165,7 +167,6 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
  cudaDataType output_type;
  cudaDataType C_type;
  cusparseComputeType compute_type;
-  auto compression_factor = 9;

  #ifdef USE_ROCM
  TORCH_CHECK(isHipSparseLtSupported());
@ -177,7 +178,6 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
      output_type = CUDA_R_8I;
      C_type = CUDA_R_8I;
      compute_type = CUSPARSE_COMPUTE_32I;
-      compression_factor = 10;
      break;

 // cuSPARSELt v0.5.2 onwards changes CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUT_16F
@ -210,7 +210,6 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
      output_type = CUDA_R_8F_E4M3;
      C_type = CUDA_R_16F;
      compute_type = CUSPARSE_COMPUTE_32F;
-      compression_factor = 10;
      break;
 #endif
 // cuSPARSELt <= v0.5.2 uses CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUTE_16F
@ -300,9 +299,10 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
    }
  }

+  TORCH_INTERNAL_ASSERT(compressed_A.dim() == 2); // encoded M x S
  int64_t k = dense_B.size(0);
  int64_t n = dense_B.size(1);
-  int64_t m = (compressed_A.numel() * 16 / compression_factor) / k;
+  int64_t m = compressed_A.size(0);

  // initialize sparse descriptor
  cusparseLtMatDescriptor_t sparse_input_descriptor;
--- a/aten/src/ATen/quantized/QTensorImpl.cpp
+++ b/aten/src/ATen/quantized/QTensorImpl.cpp
@ -7,7 +7,7 @@ QTensorImpl::QTensorImpl(
    DispatchKeySet key_set,
    const caffe2::TypeMeta data_type,
    QuantizerPtr quantizer)
-    : TensorImpl(std::move(storage), std::move(key_set), data_type),
+    : TensorImpl(std::move(storage), key_set, data_type),
      quantizer_(std::move(quantizer)) {}

 QTensorImpl::QTensorImpl(
@ -16,7 +16,7 @@ QTensorImpl::QTensorImpl(
    DispatchKeySet key_set,
    const caffe2::TypeMeta data_type,
    QuantizerPtr quantizer)
-    : TensorImpl(type, std::move(storage), std::move(key_set), data_type),
+    : TensorImpl(type, std::move(storage), key_set, data_type),
      quantizer_(std::move(quantizer)) {}

 const char* QTensorImpl::tensorimpl_type_name() const {
--- a/aten/src/ATen/quantized/QTensorImpl.h
+++ b/aten/src/ATen/quantized/QTensorImpl.h
@ -4,6 +4,8 @@
 #include <c10/core/TensorImpl.h>
 #include <c10/util/Exception.h>

+#include <utility>
+
 namespace at {

 /**
@ -36,7 +38,7 @@ struct TORCH_API QTensorImpl : public c10::TensorImpl {
  }

  void set_quantizer_(QuantizerPtr quantizer) {
-    quantizer_ = quantizer;
+    quantizer_ = std::move(quantizer);
  }

  /**
--- a/Show More
+++ b/Show More