mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-31 20:34:54 +08:00)

Compare commits: feature/ju...disp_count
Commits (3): e4de72ea5d, d9258fb366, ef78f99412
| @ -31,7 +31,8 @@ pip install -r /pytorch/requirements.txt | ||||
| pip install auditwheel==6.2.0 wheel | ||||
| if [ "$DESIRED_CUDA" = "cpu" ]; then | ||||
|     echo "BASE_CUDA_VERSION is not set. Building cpu wheel." | ||||
|     python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn | ||||
|     # USE_PRIORITIZED_TEXT_FOR_LD enables the linker script optimization, see https://github.com/pytorch/pytorch/pull/121975/files | ||||
|     USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn | ||||
| else | ||||
|     echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" | ||||
|     export USE_SYSTEM_NCCL=1 | ||||
| @ -45,5 +46,6 @@ else | ||||
|         export USE_NVIDIA_PYPI_LIBS=1 | ||||
|     fi | ||||
|  | ||||
|     python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda | ||||
|     # USE_PRIORITIZED_TEXT_FOR_LD enables the linker script optimization, see https://github.com/pytorch/pytorch/pull/121975/files | ||||
|     USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda | ||||
| fi | ||||
|  | ||||
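For context: USE_PRIORITIZED_TEXT_FOR_LD (see the PR linked in the comment above) drives tools/setup_helpers/generate_linker_script.py, which turns a list of hot symbols into a custom linker script so that frequently executed functions are laid out contiguously in .text. A minimal Python sketch of that idea, assuming the prioritized-symbols file lists one pattern per line and the base script carries a placeholder marker; the marker and function shape are illustrative, not the actual implementation:

```python
def generate_linker_script(filein: str, fout: str, base_script: str) -> None:
    # Read prioritized symbol patterns, one per line; '#' starts a comment.
    with open(filein) as f:
        symbols = [s.strip() for s in f if s.strip() and not s.startswith("#")]
    # One input-section rule per hot symbol, placed at the front of .text.
    prioritized = "\n".join(f"    *(.text.{sym}*)" for sym in symbols)
    with open(base_script) as f:
        script = f.read()
    # Assumption: the base script (e.g. dumped via `ld --verbose`) was given
    # a /*PRIORITIZED_TEXT*/ marker at the head of its .text output section.
    with open(fout, "w") as f:
        f.write(script.replace("/*PRIORITIZED_TEXT*/", prioritized, 1))
```

The CMake hunk near the end of this diff invokes the real generator as `generate_linker_script.py --filein cmake/prioritized_text.txt --fout cmake/linker_script.ld` and then passes the result to the linker via `-T`.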
| @ -317,7 +317,7 @@ if __name__ == "__main__": | ||||
|     ).decode() | ||||
|  | ||||
|     print("Building PyTorch wheel") | ||||
|     build_vars = "" | ||||
|     build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " | ||||
|     # MAX_JOBS=5 is not required for CPU backend (see commit 465d98b) | ||||
|     if enable_cuda: | ||||
|         build_vars += "MAX_JOBS=5 " | ||||
|  | ||||
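`build_vars` is a space-separated string of KEY=VALUE pairs that the script prepends to the build command; `-Wl,-z,max-page-size=0x10000` aligns ELF segments to 64 KiB so the wheel also loads on AArch64 kernels configured with 64K pages. A hedged sketch of how such a string can be expanded into a subprocess environment (the build invocation is illustrative):

```python
import os
import subprocess

# Space-separated KEY=VALUE pairs, as assembled by the CI script above.
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 MAX_JOBS=5 "

env = dict(os.environ)
# split("=", 1) keeps the '=' inside values like -Wl,-z,max-page-size=0x10000.
env.update(kv.split("=", 1) for kv in build_vars.split())

# Illustrative build step; the real script shells out with these vars set.
subprocess.check_call(["python", "setup.py", "bdist_wheel"], env=env)
```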
| @ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5): | ||||
|         try: | ||||
|             with socket.create_connection((addr, port), timeout=timeout): | ||||
|                 return | ||||
|         except (ConnectionRefusedError, TimeoutError):  # noqa: PERF203 | ||||
|         except (ConnectionRefusedError, socket.timeout):  # noqa: PERF203 | ||||
|             if i == attempt_cnt - 1: | ||||
|                 raise | ||||
|             time.sleep(timeout) | ||||
|  | ||||
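The swap from TimeoutError to socket.timeout matters on Python 3.9, where socket.create_connection raises socket.timeout, an OSError subclass distinct from TimeoutError; the two were only unified into one class in Python 3.10. The full helper, reconstructed from the hunk above:

```python
import socket
import time


def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
    """Retry until a TCP connection to (addr, port) succeeds."""
    for i in range(attempt_cnt):
        try:
            with socket.create_connection((addr, port), timeout=timeout):
                return
        except (ConnectionRefusedError, socket.timeout):  # noqa: PERF203
            if i == attempt_cnt - 1:
                raise
            time.sleep(timeout)
```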
| @ -262,10 +262,13 @@ case "$tag" in | ||||
|     TRITON_CPU=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-linter) | ||||
|     PYTHON_VERSION=3.10 | ||||
|     # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627. | ||||
|     # We will need to update mypy version eventually, but that's for another day. The task | ||||
|     # would be to upgrade mypy to 1.0.0 with Python 3.11 | ||||
|     PYTHON_VERSION=3.9 | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter) | ||||
|     PYTHON_VERSION=3.10 | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter) | ||||
|     PYTHON_VERSION=3.9 | ||||
|     CUDA_VERSION=12.8.1 | ||||
|     ;; | ||||
|   pytorch-linux-jammy-aarch64-py3.10-gcc11) | ||||
|  | ||||
| @ -1 +1 @@ | ||||
| e0dda9059d082537cee36be6c5e4fe3b18c880c0 | ||||
| 56392aa978594cc155fa8af48cd949f5b5f1823a | ||||
|  | ||||
| @ -1,2 +1,2 @@ | ||||
| transformers==4.56.0 | ||||
| transformers==4.54.0 | ||||
| soxr==0.5.0 | ||||
|  | ||||
| @ -1 +1 @@ | ||||
| bbb06c0334a6772b92d24bde54956e675c8c6604 | ||||
| 5ae38bdb0dc066c5823e34dc9797afb9de42c866 | ||||
|  | ||||
| @ -42,27 +42,22 @@ install_pip_dependencies() { | ||||
|   # A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current | ||||
|   # numba and scipy version used in PyTorch CI | ||||
|   conda_run pip uninstall -y numba scipy | ||||
|   # Yaspin is needed for running CI test (get_benchmark_analysis_data.py) | ||||
|   pip_install yaspin==3.1.0 | ||||
|  | ||||
|   popd | ||||
| } | ||||
|  | ||||
| setup_executorch() { | ||||
|   pushd executorch | ||||
|  | ||||
|   export PYTHON_EXECUTABLE=python | ||||
|   export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON" | ||||
|   export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" | ||||
|  | ||||
|   as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true | ||||
|   popd | ||||
| } | ||||
|  | ||||
| if [ $# -eq 0 ]; then | ||||
|   clone_executorch | ||||
|   install_buck2 | ||||
|   install_conda_dependencies | ||||
|   install_pip_dependencies | ||||
|   pushd executorch | ||||
|   setup_executorch | ||||
|   popd | ||||
| else | ||||
|   "$@" | ||||
| fi | ||||
| clone_executorch | ||||
| install_buck2 | ||||
| install_conda_dependencies | ||||
| install_pip_dependencies | ||||
| setup_executorch | ||||
|  | ||||
| @ -93,9 +93,8 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x" | ||||
| #Pinned versions: | ||||
| #test that import: | ||||
|  | ||||
| mypy==1.16.0 ; platform_system != "Windows" | ||||
| mypy==1.16.0 | ||||
| # Pin MyPy version because new errors are likely to appear with each release | ||||
| # Skip on Windows as lots of type annotations are POSIX specific | ||||
| #Description: linter | ||||
| #Pinned versions: 1.16.0 | ||||
| #test that import: test_typing.py, test_type_hints.py | ||||
|  | ||||
| @ -1,7 +1,7 @@ | ||||
| sphinx==5.3.0 | ||||
| #Description: This is used to generate PyTorch docs | ||||
| #Pinned versions: 5.3.0 | ||||
| -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2 | ||||
| -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2 | ||||
|  | ||||
| # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering | ||||
| # but it doesn't seem to work and hangs around idly. The initial thought that it is probably | ||||
|  | ||||
| @ -41,6 +41,7 @@ def sample_vllm_test_library(): | ||||
|                 "pytest -v -s basic_correctness/test_cumem.py", | ||||
|                 "pytest -v -s basic_correctness/test_basic_correctness.py", | ||||
|                 "pytest -v -s basic_correctness/test_cpu_offload.py", | ||||
|                 "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py", | ||||
|             ], | ||||
|         }, | ||||
|         "vllm_basic_models_test": { | ||||
| @ -67,12 +68,15 @@ def sample_vllm_test_library(): | ||||
|                         "-v", | ||||
|                         "-s", | ||||
|                         "entrypoints/llm", | ||||
|                         "--ignore=entrypoints/llm/test_lazy_outlines.py", | ||||
|                         "--ignore=entrypoints/llm/test_generate.py", | ||||
|                         "--ignore=entrypoints/llm/test_generate_multiple_loras.py", | ||||
|                         "--ignore=entrypoints/llm/test_collective_rpc.py", | ||||
|                     ] | ||||
|                 ), | ||||
|                 "pytest -v -s entrypoints/llm/test_generate.py", | ||||
|                 "pytest -v -s entrypoints/offline_mode", | ||||
|                 "pytest -v -s entrypoints/llm/test_lazy_outlines.py", | ||||
|                 "pytest -v -s entrypoints/llm/test_generate.py ", | ||||
|                 "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode", | ||||
|             ], | ||||
|         }, | ||||
|         "vllm_regression_test": { | ||||
|  | ||||
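Each suite in the library above is a list of shell strings, some carrying inline environment prefixes such as `VLLM_USE_V1=0` or `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1`. A hedged sketch of a runner for that shape (the `steps` key and function name are hypothetical, for illustration only):

```python
import subprocess

def run_suite(test_library: dict, suite: str) -> None:
    for cmd in test_library[suite]["steps"]:  # hypothetical key name
        # shell=True so the inline KEY=VALUE prefixes are honored by the shell.
        subprocess.run(cmd, shell=True, check=True)
```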
							
								
								
									
.ci/pytorch/functorch_doc_push_script.sh (new executable file, +40 lines)
| @ -0,0 +1,40 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This is where the local pytorch install in the docker image is located | ||||
| pt_checkout="/var/lib/jenkins/workspace" | ||||
| source "$pt_checkout/.ci/pytorch/common_utils.sh" | ||||
| echo "functorch_doc_push_script.sh: Invoked with $*" | ||||
|  | ||||
| set -ex -o pipefail | ||||
|  | ||||
| version=${DOCS_VERSION:-nightly} | ||||
| echo "version: $version" | ||||
|  | ||||
| # Build functorch docs | ||||
| pushd $pt_checkout/functorch/docs | ||||
| make html | ||||
| popd | ||||
|  | ||||
| git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages | ||||
| pushd functorch_ghpages | ||||
|  | ||||
| if [ "$version" == "main" ]; then | ||||
|   version=nightly | ||||
| fi | ||||
|  | ||||
| git rm -rf "$version" || true | ||||
| mv "$pt_checkout/functorch/docs/build/html" "$version" | ||||
|  | ||||
| git add "$version" || true | ||||
| git status | ||||
| git config user.email "soumith+bot@pytorch.org" | ||||
| git config user.name "pytorchbot" | ||||
| # If there aren't changes, don't make a commit; push is no-op | ||||
| git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true | ||||
| git status | ||||
|  | ||||
| if [[ "${WITH_PUSH:-}" == true ]]; then | ||||
|   git push -u origin gh-pages | ||||
| fi | ||||
|  | ||||
| popd | ||||
| @ -1,25 +0,0 @@ | ||||
| From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001 | ||||
| From: Michael Wang <13521008+isVoid@users.noreply.github.com> | ||||
| Date: Tue, 1 Apr 2025 17:28:05 -0700 | ||||
| Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage | ||||
|  (#185) | ||||
|  | ||||
| Co-authored-by: isVoid <isVoid@users.noreply.github.com> | ||||
| --- | ||||
|  numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++ | ||||
|  1 file changed, 3 insertions(+) | ||||
|  | ||||
| diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py | ||||
| index 1641bf77..233e9ed7 100644 | ||||
| --- a/numba_cuda/numba/cuda/cudadrv/driver.py | ||||
| +++ b/numba_cuda/numba/cuda/cudadrv/driver.py | ||||
| @@ -365,6 +365,9 @@ def _find_api(self, fname): | ||||
|          else: | ||||
|              variants = ('_v2', '') | ||||
|   | ||||
| +        if fname in ("cuCtxGetDevice", "cuCtxSynchronize"): | ||||
| +            return getattr(self.lib, fname) | ||||
| + | ||||
|          for variant in variants: | ||||
|              try: | ||||
|                  return getattr(self.lib, f'{fname}{variant}') | ||||
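The patch above pins cuCtxGetDevice and cuCtxSynchronize to their unversioned entry points so numba-cuda does not resolve them to newer `_v2`-style variants that future drivers may change. A simplified sketch of the lookup after the patch (assuming `self.lib` is the loaded libcuda handle; the trailing error handling is condensed):

```python
def _find_api(self, fname):
    # These symbols must keep their original ABI; never probe for _v2.
    if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
        return getattr(self.lib, fname)
    # Prefer the versioned variant, fall back to the plain name.
    for variant in ("_v2", ""):
        try:
            return getattr(self.lib, f"{fname}{variant}")
        except AttributeError:
            continue
    raise AttributeError(fname)
```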
| @ -32,16 +32,6 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v | ||||
|   git config --global --add safe.directory /var/lib/jenkins/workspace | ||||
| fi | ||||
|  | ||||
|  | ||||
| # Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878 | ||||
| NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true) | ||||
| if [ -n "$NUMBA_CUDA_DIR" ]; then | ||||
|   NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch" | ||||
|   pushd "$NUMBA_CUDA_DIR" | ||||
|   patch -p4 <"$NUMBA_PATCH" | ||||
|   popd | ||||
| fi | ||||
|  | ||||
| echo "Environment variables:" | ||||
| env | ||||
|  | ||||
| @ -334,17 +324,11 @@ test_python() { | ||||
| } | ||||
|  | ||||
| test_python_smoke() { | ||||
|   # Smoke tests for H100/B200 | ||||
|   # Smoke tests for H100 | ||||
|   time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running | ||||
|   assert_git_not_dirty | ||||
| } | ||||
|  | ||||
| test_python_smoke_b200() { | ||||
|   # Targeted smoke tests for B200 - staged approach to avoid too many failures | ||||
|   time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running | ||||
|   assert_git_not_dirty | ||||
| } | ||||
|  | ||||
| test_h100_distributed() { | ||||
|   # Distributed tests at H100 | ||||
|   time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py  $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running | ||||
| @ -1556,10 +1540,14 @@ test_executorch() { | ||||
|   install_torchvision | ||||
|   install_torchaudio | ||||
|  | ||||
|   INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh" | ||||
|  | ||||
|   pushd /executorch | ||||
|   "${INSTALL_SCRIPT}" setup_executorch | ||||
|  | ||||
|   export PYTHON_EXECUTABLE=python | ||||
|   export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" | ||||
|  | ||||
|   # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch | ||||
|   # from the PR | ||||
|   bash .ci/scripts/setup-linux.sh --build-tool cmake | ||||
|  | ||||
|   echo "Run ExecuTorch unit tests" | ||||
|   pytest -v -n auto | ||||
| @ -1573,6 +1561,10 @@ test_executorch() { | ||||
|  | ||||
|   popd | ||||
|  | ||||
|   # Test torchgen generated code for Executorch. | ||||
|   echo "Testing ExecuTorch op registration" | ||||
|   "$BUILD_BIN_DIR"/test_edge_op_registration | ||||
|  | ||||
|   assert_git_not_dirty | ||||
| } | ||||
|  | ||||
| @ -1580,7 +1572,6 @@ test_linux_aarch64() { | ||||
|   python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ | ||||
|         test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \ | ||||
|         test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \ | ||||
|         distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \ | ||||
|         --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose | ||||
|  | ||||
|   # Dynamo tests | ||||
| @ -1779,8 +1770,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then | ||||
|   test_xpu_bin | ||||
| elif [[ "${TEST_CONFIG}" == smoke ]]; then | ||||
|   test_python_smoke | ||||
| elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then | ||||
|   test_python_smoke_b200 | ||||
| elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then | ||||
|   test_h100_distributed | ||||
| elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then | ||||
|  | ||||
| @ -137,7 +137,7 @@ sccache --show-stats | ||||
| python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])" | ||||
| ( | ||||
|   if "%BUILD_ENVIRONMENT%"=="" ( | ||||
|     echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash. | ||||
|     echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash. | ||||
|   ) else ( | ||||
|     copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%" | ||||
|  | ||||
|  | ||||
| @ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" ( | ||||
| ) else ( | ||||
|   set CONDA_PARENT_DIR=C:\Jenkins | ||||
| ) | ||||
| set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3 | ||||
|  | ||||
|  | ||||
| :: Be conservative here when rolling out the new AMI with conda. This will try | ||||
| :: to install conda as before if it couldn't find the conda installation. This | ||||
| :: can be removed eventually after we gain enough confidence in the AMI | ||||
| if not exist %CONDA_ROOT_DIR% ( | ||||
| if not exist %CONDA_PARENT_DIR%\Miniconda3 ( | ||||
|   set INSTALL_FRESH_CONDA=1 | ||||
| ) | ||||
|  | ||||
| @ -17,14 +17,10 @@ if "%INSTALL_FRESH_CONDA%"=="1" ( | ||||
|   if errorlevel 1 exit /b | ||||
|   if not errorlevel 0 exit /b | ||||
|  | ||||
|   %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR% | ||||
|   %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3 | ||||
|   if errorlevel 1 exit /b | ||||
|   if not errorlevel 0 exit /b | ||||
| ) | ||||
|  | ||||
| :: Activate conda so that we can use its commands, i.e. conda, python, pip | ||||
| call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR% | ||||
| :: Activate conda so that we can use its commands, i.e. conda, python, pip | ||||
| call conda activate py_tmp | ||||
|  | ||||
| call pip install -r .ci/docker/requirements-ci.txt | ||||
| call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3 | ||||
|  | ||||
| @ -14,7 +14,7 @@ if not errorlevel 0 exit /b | ||||
| :: build\torch. Rather than changing all these references, making a copy of torch folder | ||||
| :: from conda to the current workspace is easier. The workspace will be cleaned up after | ||||
| :: the job anyway | ||||
| xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ | ||||
| xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ | ||||
|  | ||||
| pushd . | ||||
| if "%VC_VERSION%" == "" ( | ||||
|  | ||||
| @ -38,14 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then | ||||
| fi | ||||
|  | ||||
| # TODO: Move both of them to Windows AMI | ||||
| python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 | ||||
|  | ||||
| # Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments | ||||
| # pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node' | ||||
| # scipy from 1.6.3 to 1.10 | ||||
| # expecttest from 0.1.3 to 0.3.0 | ||||
| # xdoctest from 1.0.2 to 1.3.0 | ||||
| python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42" | ||||
| python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 | ||||
|  | ||||
| # Install Z3 optional dependency for Windows builds. | ||||
| python -m pip install z3-solver==4.15.1.0 | ||||
| @ -59,6 +52,9 @@ python -m pip install parameterized==0.8.1 | ||||
| # Install pulp for testing ilps under torch\distributed\_tools | ||||
| python -m pip install pulp==2.9.0 | ||||
|  | ||||
| # Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308 | ||||
| python -m pip install expecttest==0.3.0 | ||||
|  | ||||
| run_tests() { | ||||
|     # Run nvidia-smi if available | ||||
|     for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do | ||||
|  | ||||
| @ -264,7 +264,7 @@ def unzip_artifact_and_replace_files() -> None: | ||||
|         change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py") | ||||
|  | ||||
|         for file in Path(f"artifacts/dist/{old_stem}").glob( | ||||
|             "*.dist-info/*", | ||||
|             "*.dist-info/**", | ||||
|         ): | ||||
|             change_content_to_new_version(file) | ||||
|  | ||||
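The change from `*.dist-info/*` to `*.dist-info/**` widens the match: with pathlib, `*` stops at one path component, while `**` recurses into subdirectories (e.g. a `licenses/` folder inside newer `.dist-info` directories). Note that before Python 3.13 a trailing `**` yields directories only, so the exact result set depends on the interpreter version. A quick illustration over a hypothetical layout:

```python
from pathlib import Path

pkg = Path("artifacts/dist/example")  # hypothetical wheel-unpack directory
# '*' matches one level: only the top-level entries of each .dist-info dir.
shallow = sorted(pkg.glob("*.dist-info/*"))
# '**' recurses; before Python 3.13 it returns directories only, so
# '*.dist-info/**/*' is the variant that also lists nested files there.
deep = sorted(pkg.glob("*.dist-info/**"))
print(shallow, deep, sep="\n")
```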
|  | ||||
							
								
								
									
.github/actions/setup-win/action.yml (vendored, 16 lines changed)
| @ -6,12 +6,6 @@ inputs: | ||||
|   cuda-version: | ||||
|     description: which cuda version to install, 'cpu' for none | ||||
|     required: true | ||||
|   python-version: | ||||
|     required: false | ||||
|     type: string | ||||
|     default: "3.10" | ||||
|     description: | | ||||
|       The python version to be used. Will be 3.10 by default | ||||
|  | ||||
| runs: | ||||
|   using: composite | ||||
| @ -44,24 +38,18 @@ runs: | ||||
|         CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat" | ||||
|  | ||||
|         { | ||||
|           echo "CONDA=${CONDA}"; | ||||
|           echo "CONDA_RUN=${CONDA} run --no-capture-output"; | ||||
|           echo "CONDA_BUILD=${CONDA} run conda-build"; | ||||
|           echo "CONDA_INSTALL=${CONDA} install"; | ||||
|         } >> "${GITHUB_ENV}" | ||||
|  | ||||
|     - name: Setup Python3 | ||||
|       env: | ||||
|           PYTHON_VERSION: ${{ inputs.python-version }} | ||||
|       shell: bash | ||||
|       run: | | ||||
|         set +e | ||||
|         set -x | ||||
|  | ||||
|         # Create new py_tmp env with python-version | ||||
|         ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp | ||||
|  | ||||
|         PYTHON3=$(${CONDA_RUN} -n py_tmp which python3) | ||||
|         PYTHON3=$(${CONDA_RUN} which python3) | ||||
|         EXIT_CODE=$? | ||||
|  | ||||
|         if [[ "${EXIT_CODE}" == "0" ]]; then | ||||
| @ -74,7 +62,7 @@ runs: | ||||
|           # installation, which is Python 3 based. Its Python is default to Python 3. Further, there | ||||
|           # is also the Miniconda installation that is Python 2 based, and both can be installed if | ||||
|           # needed. In both cases, Python binary is just called python | ||||
|           PYTHON=$(${CONDA_RUN} -n py_tmp which python) | ||||
|           PYTHON=$(${CONDA_RUN} which python) | ||||
|           EXIT_CODE=$? | ||||
|  | ||||
|           if [[ "${EXIT_CODE}" == "0" ]]; then | ||||
|  | ||||
							
								
								
									
.github/ci_commit_pins/vllm.txt (vendored, 2 lines changed)
| @ -1 +1 @@ | ||||
| 090197034faf3b193c4467cedeb9281e3078892d | ||||
| 5bcc153d7bf69ef34bc5788a33f60f1792cf2861 | ||||
|  | ||||
							
								
								
									
.github/labeler.yml (vendored, 3 lines changed)
| @ -130,6 +130,3 @@ | ||||
| - torch/csrc/inductor/aoti_include/** | ||||
| - torchgen/aoti/** | ||||
| - torchgen/gen_aoti_c_shim.py | ||||
|  | ||||
| "ciflow/vllm": | ||||
| - .github/ci_commit_pins/vllm.txt | ||||
|  | ||||
							
								
								
									
.github/pytorch-probot.yml (vendored, 1 line changed)
| @ -36,7 +36,6 @@ ciflow_push_tags: | ||||
| - ciflow/win-arm64 | ||||
| - ciflow/h100-symm-mem | ||||
| - ciflow/h100-cutlass-backend | ||||
| - ciflow/b200 | ||||
| retryable_workflows: | ||||
| - pull | ||||
| - trunk | ||||
|  | ||||
							
								
								
									
.github/scripts/generate_ci_workflows.py (vendored, 2 lines changed)
| @ -135,7 +135,7 @@ ROCM_SMOKE_WORKFLOWS = [ | ||||
|         build_configs=generate_binary_build_matrix.generate_wheels_matrix( | ||||
|             OperatingSystem.LINUX, | ||||
|             arches=["6.4"], | ||||
|             python_versions=["3.10"], | ||||
|             python_versions=["3.9"], | ||||
|         ), | ||||
|         ciflow_config=CIFlowConfig( | ||||
|             labels={ | ||||
|  | ||||
							
								
								
									
.github/workflows/_binary-test-linux.yml (vendored, 2 lines changed)
| @ -187,6 +187,8 @@ jobs: | ||||
|  | ||||
|       - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG | ||||
|         uses: pytorch/test-infra/.github/actions/setup-nvidia@main | ||||
|         with: | ||||
|           driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }} | ||||
|         if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }} | ||||
|  | ||||
|       - name: configure aws credentials | ||||
|  | ||||
							
								
								
									
.github/workflows/_docs.yml (vendored, 14 lines changed)
| @ -75,6 +75,10 @@ jobs: | ||||
|             runner: ${{ inputs.runner_prefix }}linux.2xlarge | ||||
|             # It takes less than 30m to finish python docs unless there are issues | ||||
|             timeout-minutes: 30 | ||||
|           - docs_type: functorch | ||||
|             runner: ${{ inputs.runner_prefix }}linux.2xlarge | ||||
|             # It takes less than 15m to finish functorch docs unless there are issues | ||||
|             timeout-minutes: 15 | ||||
|     # Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180) | ||||
|     # The current name requires updating the database last docs push query from test-infra every time the matrix is updated | ||||
|     name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }} | ||||
| @ -207,6 +211,16 @@ jobs: | ||||
|           path: cppdocs/ | ||||
|           s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs | ||||
|  | ||||
|       - name: Upload functorch Docs Preview | ||||
|         uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 | ||||
|         if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }} | ||||
|         with: | ||||
|           retention-days: 14 | ||||
|           s3-bucket: doc-previews | ||||
|           if-no-files-found: error | ||||
|           path: functorch_ghpages/nightly/ | ||||
|           s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs | ||||
|  | ||||
|       - name: Teardown Linux | ||||
|         uses: pytorch/test-infra/.github/actions/teardown-linux@main | ||||
|         if: always() | ||||
|  | ||||
							
								
								
									
.github/workflows/_get-changed-files.yml (vendored, 28 lines changed)
| @ -2,12 +2,6 @@ name: Get Changed Files | ||||
|  | ||||
| on: | ||||
|   workflow_call: | ||||
|     inputs: | ||||
|       all_files: | ||||
|         description: "Whether to return all files instead of just changed files" | ||||
|         required: false | ||||
|         type: boolean | ||||
|         default: false | ||||
|     outputs: | ||||
|       changed-files: | ||||
|         description: "List of changed files (space-separated) or '*' if not in a PR" | ||||
| @ -32,23 +26,17 @@ jobs: | ||||
|             # Get the PR number from the github context | ||||
|             PR_NUMBER="${{ github.event.number }}" | ||||
|  | ||||
|             # Check if all_files is requested | ||||
|             if [ "${{ inputs.all_files }}" = "true" ]; then | ||||
|               echo "all_files input is true, returning all files" | ||||
|               echo "changed-files=*" >> "$GITHUB_OUTPUT" | ||||
|             else | ||||
|               # Use gh CLI to get changed files in the PR with explicit repo | ||||
|               CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//') | ||||
|             # Use gh CLI to get changed files in the PR with explicit repo | ||||
|             CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//') | ||||
|  | ||||
|               if [ -z "$CHANGED_FILES" ]; then | ||||
|                 echo "No changed files found, setting to '*'" | ||||
|                 CHANGED_FILES="*" | ||||
|               fi | ||||
|  | ||||
|               echo "Changed files: $CHANGED_FILES" | ||||
|               echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT" | ||||
|             if [ -z "$CHANGED_FILES" ]; then | ||||
|               echo "No changed files found, setting to '*'" | ||||
|               CHANGED_FILES="*" | ||||
|             fi | ||||
|  | ||||
|             echo "Changed files: $CHANGED_FILES" | ||||
|             echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT" | ||||
|  | ||||
|           else | ||||
|             echo "Not in PR context, setting changed files to '*'" | ||||
|             echo "changed-files=*" >> "$GITHUB_OUTPUT" | ||||
|  | ||||
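The workflow above shells out to `gh api ... --paginate --jq` to collect the names of non-removed files. For reference, a hedged Python equivalent of the same REST call (`GET /repos/{owner}/{repo}/pulls/{number}/files`), including the fallback to `*` when nothing is found:

```python
import requests  # third-party; pip install requests

def changed_files(repo: str, pr: int, token: str) -> str:
    files, page = [], 1
    while True:
        r = requests.get(
            f"https://api.github.com/repos/{repo}/pulls/{pr}/files",
            params={"per_page": 100, "page": page},
            headers={"Authorization": f"Bearer {token}"},
        )
        r.raise_for_status()
        batch = r.json()
        if not batch:
            break
        files += [f["filename"] for f in batch if f["status"] != "removed"]
        page += 1
    return " ".join(files) or "*"  # '*' mirrors the workflow's fallback
```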
							
								
								
									
.github/workflows/_linux-test.yml (vendored, 2 lines changed)
| @ -169,7 +169,7 @@ jobs: | ||||
|         id: install-nvidia-driver | ||||
|         uses: pytorch/test-infra/.github/actions/setup-nvidia@main | ||||
|         with: | ||||
|           driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }} | ||||
|           driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }} | ||||
|         if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }} | ||||
|  | ||||
|       - name: Setup GPU_FLAG for docker run | ||||
|  | ||||
							
								
								
									
.github/workflows/_win-build.yml (vendored, 2 lines changed)
| @ -151,7 +151,7 @@ jobs: | ||||
|           BUILD_WHEEL: 1 | ||||
|           MAX_JOBS: 8 | ||||
|           CUDA_VERSION: ${{ inputs.cuda-version }} | ||||
|           PYTHON_VERSION: "3.10" | ||||
|           PYTHON_VERSION: "3.9" | ||||
|           SCCACHE_BUCKET: "ossci-compiler-cache" | ||||
|           SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} | ||||
|           SCCACHE_REGION: us-east-1 | ||||
|  | ||||
							
								
								
									
.github/workflows/_win-test.yml (vendored, 2 lines changed)
| @ -184,7 +184,7 @@ jobs: | ||||
|         env: | ||||
|           USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} | ||||
|           INSTALL_WINDOWS_SDK: 1 | ||||
|           PYTHON_VERSION: "3.10" | ||||
|           PYTHON_VERSION: 3.9 | ||||
|           CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} | ||||
|           VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} | ||||
|           TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }} | ||||
|  | ||||
							
								
								
									
.github/workflows/build-vllm-wheel.yml (vendored, 2 lines changed)
| @ -178,7 +178,7 @@ jobs: | ||||
|       contents: read | ||||
|     container: | ||||
|       image: continuumio/miniconda3:4.12.0 | ||||
|     environment: ${{ ((github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'nightly-wheel-upload' || '' }} | ||||
|     environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }} | ||||
|     steps: | ||||
|       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 | ||||
|  | ||||
|  | ||||
							
								
								
									
.github/workflows/docker-builds.yml (vendored, 5 lines changed)
| @ -70,8 +70,9 @@ jobs: | ||||
|           pytorch-linux-jammy-py3-clang18-asan, | ||||
|           pytorch-linux-jammy-py3-clang12-onnx, | ||||
|           pytorch-linux-jammy-linter, | ||||
|           pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter, | ||||
|           pytorch-linux-jammy-py3-clang12-executorch, | ||||
|           pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, | ||||
|           # Executorch pin needs update | ||||
|           # pytorch-linux-jammy-py3-clang12-executorch, | ||||
|           pytorch-linux-jammy-py3.12-triton-cpu, | ||||
|           pytorch-linux-noble-riscv64-py3.12-gcc14 | ||||
|         ] | ||||
|  | ||||
							
								
								
									
.github/workflows/generated-linux-binary-manywheel-rocm-main.yml (generated, vendored, 14 lines changed)
| @ -44,7 +44,7 @@ jobs: | ||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||
|       curr_branch: ${{ github.head_ref || github.ref_name }} | ||||
|       curr_ref_type: ${{ github.ref_type }} | ||||
|   manywheel-py3_10-rocm6_4-build: | ||||
|   manywheel-py3_9-rocm6_4-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     uses: ./.github/workflows/_binary-build-linux.yml | ||||
|     needs: get-label-type | ||||
| @ -58,16 +58,16 @@ jobs: | ||||
|       GPU_ARCH_TYPE: rocm | ||||
|       DOCKER_IMAGE: manylinux2_28-builder | ||||
|       DOCKER_IMAGE_TAG_PREFIX: rocm6.4 | ||||
|       DESIRED_PYTHON: "3.10" | ||||
|       DESIRED_PYTHON: "3.9" | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build_name: manywheel-py3_10-rocm6_4 | ||||
|       build_name: manywheel-py3_9-rocm6_4 | ||||
|       build_environment: linux-binary-manywheel-rocm | ||||
|     secrets: | ||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||
|   manywheel-py3_10-rocm6_4-test:  # Testing | ||||
|   manywheel-py3_9-rocm6_4-test:  # Testing | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: | ||||
|       - manywheel-py3_10-rocm6_4-build | ||||
|       - manywheel-py3_9-rocm6_4-build | ||||
|       - get-label-type | ||||
|     runs-on: linux.rocm.gpu.mi250 | ||||
|     timeout-minutes: 240 | ||||
| @ -82,14 +82,14 @@ jobs: | ||||
|       SKIP_ALL_TESTS: 1 | ||||
|       DOCKER_IMAGE: manylinux2_28-builder | ||||
|       DOCKER_IMAGE_TAG_PREFIX: rocm6.4 | ||||
|       DESIRED_PYTHON: "3.10" | ||||
|       DESIRED_PYTHON: "3.9" | ||||
|     steps: | ||||
|       - name: Setup ROCm | ||||
|         uses: ./.github/actions/setup-rocm | ||||
|       - uses: actions/download-artifact@v4.1.7 | ||||
|         name: Download Build Artifacts | ||||
|         with: | ||||
|           name: manywheel-py3_10-rocm6_4 | ||||
|           name: manywheel-py3_9-rocm6_4 | ||||
|           path: "${{ runner.temp }}/artifacts/" | ||||
|       - name: Checkout PyTorch | ||||
|         uses: actions/checkout@v4 | ||||
|  | ||||
							
								
								
									
.github/workflows/lint.yml (vendored, 8 lines changed)
| @ -31,8 +31,6 @@ jobs: | ||||
|     if: github.repository_owner == 'pytorch' | ||||
|     name: Get changed files | ||||
|     uses: ./.github/workflows/_get-changed-files.yml | ||||
|     with: | ||||
|       all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }} | ||||
|  | ||||
|   lintrunner-clang: | ||||
|     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main | ||||
| @ -55,7 +53,7 @@ jobs: | ||||
|     with: | ||||
|       timeout: 120 | ||||
|       runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" | ||||
|       docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter | ||||
|       docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter | ||||
|       # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout | ||||
|       # to run git rev-parse HEAD~:.ci/docker when a new image is needed | ||||
|       fetch-depth: 0 | ||||
| @ -266,10 +264,10 @@ jobs: | ||||
|         with: | ||||
|           submodules: false | ||||
|           fetch-depth: 1 | ||||
|       - name: Setup Python 3.10 | ||||
|       - name: Setup Python 3.9 | ||||
|         uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 | ||||
|         with: | ||||
|           python-version: '3.10' | ||||
|           python-version: '3.9' | ||||
|           architecture: x64 | ||||
|           cache: pip | ||||
|       - name: Install dependencies | ||||
|  | ||||
							
								
								
									
.github/workflows/pull.yml (vendored, 26 lines changed)
| @ -316,6 +316,32 @@ jobs: | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-py3-clang12-executorch-build: | ||||
|     if: false  # Docker build needs pin update | ||||
|     name: linux-jammy-py3-clang12-executorch | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-py3-clang12-executorch | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-py3-clang12-executorch-test: | ||||
|     name: linux-jammy-py3-clang12-executorch | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: linux-jammy-py3-clang12-executorch-build | ||||
|     if: false # Has been broken for a while | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3-clang12-executorch | ||||
|       docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: | ||||
|     name: cuda12.8-py3.10-gcc9-sm75 | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|  | ||||
							
								
								
									
.github/workflows/test-b200.yml (vendored, file deleted, 76 lines)
| @ -1,76 +0,0 @@ | ||||
| # B200 Smoke Tests CI Workflow | ||||
| # | ||||
| # This workflow runs smoke tests on B200 hardware | ||||
| # | ||||
| # Flow: | ||||
| # 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200 | ||||
| # 2. Runs smoke tests on linux.dgx.b200 runner | ||||
| # 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function | ||||
| # | ||||
| # Triggered by: | ||||
| # - Pull requests modifying this workflow file | ||||
| # - Manual dispatch | ||||
| # - Schedule (every 6 hours) | ||||
| # - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag) | ||||
|  | ||||
| name: B200 Smoke Tests | ||||
|  | ||||
| on: | ||||
|   pull_request: | ||||
|     paths: | ||||
|       - .github/workflows/test-b200.yml | ||||
|   workflow_dispatch: | ||||
|   schedule: | ||||
|     - cron: 0 4,10,16,22 * * *  # every 6 hours | ||||
|   push: | ||||
|     tags: | ||||
|       - ciflow/b200/* | ||||
|  | ||||
| concurrency: | ||||
|   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} | ||||
|   cancel-in-progress: true | ||||
|  | ||||
| permissions: | ||||
|   id-token: write | ||||
|   contents: read | ||||
|  | ||||
| jobs: | ||||
|  | ||||
|   get-label-type: | ||||
|     if: github.repository_owner == 'pytorch' | ||||
|     name: get-label-type | ||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main | ||||
|     with: | ||||
|       triggering_actor: ${{ github.triggering_actor }} | ||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||
|       curr_branch: ${{ github.head_ref || github.ref_name }} | ||||
|       curr_ref_type: ${{ github.ref_type }} | ||||
|  | ||||
|   linux-jammy-cuda12_8-py3_10-gcc11-sm100-build: | ||||
|     name: linux-jammy-cuda12.8-py3.10-gcc11-sm100 | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       runner: linux.12xlarge.memory | ||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100 | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 | ||||
|       cuda-arch-list: '10.0' | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, | ||||
|         ]} | ||||
|       # config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-cuda12_8-py3_10-gcc11-sm100-test: | ||||
|     name: linux-jammy-cuda12.8-py3.10-gcc11-sm100 | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: | ||||
|       - linux-jammy-cuda12_8-py3_10-gcc11-sm100-build | ||||
|     with: | ||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100 | ||||
|       docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }} | ||||
|       aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only | ||||
|     secrets: inherit | ||||
							
								
								
									
.github/workflows/trunk.yml (vendored, 24 lines changed)
| @ -259,27 +259,3 @@ jobs: | ||||
|       docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-py3-clang12-executorch-build: | ||||
|     name: linux-jammy-py3-clang12-executorch | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-py3-clang12-executorch | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-py3-clang12-executorch-test: | ||||
|     name: linux-jammy-py3-clang12-executorch | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: linux-jammy-py3-clang12-executorch-build | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3-clang12-executorch | ||||
|       docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} | ||||
|     secrets: inherit | ||||
|  | ||||
							
								
								
									
.github/workflows/unstable.yml (vendored, 24 lines changed)
| @ -53,3 +53,27 @@ jobs: | ||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||
|       curr_branch: ${{ github.head_ref || github.ref_name }} | ||||
|       curr_ref_type: ${{ github.ref_type }} | ||||
|  | ||||
|   linux-jammy-py3_9-clang9-xla-build: | ||||
|     name: linux-jammy-py3_9-clang9-xla | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-py3.9-clang9-xla | ||||
|       docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-py3_9-clang9-xla-test: | ||||
|     name: linux-jammy-py3_9-clang9-xla | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: linux-jammy-py3_9-clang9-xla-build | ||||
|     with: | ||||
|       build-environment: linux-jammy-py3.9-clang9-xla | ||||
|       docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }} | ||||
|     secrets: inherit | ||||
|  | ||||
							
								
								
									
.github/workflows/vllm.yml (vendored, 2 lines changed)
| @ -36,8 +36,6 @@ jobs: | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       # When building vLLM, uv doesn't like that we rename wheel without changing the wheel metadata | ||||
|       allow-reuse-old-whl: false | ||||
|       build-additional-packages: "vision audio" | ||||
|       build-external-packages: "vllm" | ||||
|       build-environment: linux-jammy-cuda12.8-py3.12-gcc11 | ||||
|  | ||||
							
								
								
									
.gitignore (vendored, 3 lines changed)
| @ -259,9 +259,6 @@ gen | ||||
| .pytest_cache | ||||
| aten/build/* | ||||
|  | ||||
| # Linker scripts for prioritized text optimization | ||||
| cmake/linker_script.ld | ||||
|  | ||||
| # Bram | ||||
| plsdontbreak | ||||
|  | ||||
|  | ||||
| @ -123,7 +123,6 @@ is_formatter = true | ||||
| code = 'MYPY' | ||||
| include_patterns = [ | ||||
|     'setup.py', | ||||
|     'functorch/dim/**/*.py', | ||||
|     'torch/**/*.py', | ||||
|     'torch/**/*.pyi', | ||||
|     'caffe2/**/*.py', | ||||
| @ -196,7 +195,6 @@ exclude_patterns = [ | ||||
|     'tools/test/gen_operators_yaml_test.py', | ||||
|     'tools/test/gen_oplist_test.py', | ||||
|     'tools/test/test_selective_build.py', | ||||
|     'tools/experimental/dynamic_shapes/torchfuzz/**', | ||||
| ] | ||||
| command = [ | ||||
|     'python3', | ||||
| @ -966,6 +964,7 @@ exclude_patterns = [ | ||||
|     'test/jit/**',  # should be run through test/test_jit.py | ||||
|     'test/ao/sparsity/**',  # should be run through test/test_ao_sparsity.py | ||||
|     'test/fx/**',  # should be run through test/test_fx.py | ||||
|     'test/bottleneck_test/**',  # excluded by test/run_test.py | ||||
|     'test/package/**',  # excluded by test/run_test.py | ||||
|     'test/distributed/argparse_util_test.py', | ||||
|     'test/distributed/bin/test_script.py', | ||||
| @ -1411,6 +1410,8 @@ exclude_patterns = [ | ||||
|     'torch/utils/benchmark/utils/timer.py', | ||||
|     'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py', | ||||
|     'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py', | ||||
|     'torch/utils/bottleneck/__init__.py', | ||||
|     'torch/utils/bottleneck/__main__.py', | ||||
|     'torch/utils/bundled_inputs.py', | ||||
|     'torch/utils/checkpoint.py', | ||||
|     'torch/utils/collect_env.py', | ||||
|  | ||||
| @ -1,4 +1,5 @@ | ||||
| cmake_minimum_required(VERSION 3.27 FATAL_ERROR) | ||||
| # cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW) | ||||
|  | ||||
| # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this | ||||
| # sometimes makes XCode C compiler gets detected as "Clang", even when the C++ | ||||
| @ -379,13 +380,6 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder" | ||||
|                        OFF "USE_CUDA" OFF) | ||||
| cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON | ||||
|                         "CPU_AARCH64" OFF) | ||||
| # prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le. | ||||
| set(USE_PRIORITIZED_TEXT_DEFAULT OFF) | ||||
| if(LINUX AND CPU_AARCH64) | ||||
|   set(USE_PRIORITIZED_TEXT_DEFAULT ON) | ||||
| endif() | ||||
| cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld." | ||||
|   "${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF) | ||||
|  | ||||
| option(USE_MIMALLOC "Use mimalloc" OFF) | ||||
| # Enable third party mimalloc library to improve memory allocation performance | ||||
| @ -663,11 +657,6 @@ endif(MSVC) | ||||
|  | ||||
| string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all") | ||||
|  | ||||
| # Set linker max-page-size to 64KiB on AArch64 Linux | ||||
| if(LINUX AND CPU_AARCH64) | ||||
|   add_link_options_if_supported("-z,max-page-size=0x10000") | ||||
| endif() | ||||
|  | ||||
| # Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not | ||||
| # applicable to mobile are disabled by this variable. Setting | ||||
| # `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it | ||||
| @ -902,7 +891,7 @@ IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH) | ||||
| endif() | ||||
|  | ||||
| # Set USE_FBGEMM_GENAI to ON for CUDA build on SM100. | ||||
| if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32) | ||||
| if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) | ||||
|   message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a") | ||||
|   set(USE_FBGEMM_GENAI ON) | ||||
| endif() | ||||
| @ -1432,57 +1421,3 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA) | ||||
|   install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" | ||||
|           DESTINATION "${CMAKE_INSTALL_BINDIR}") | ||||
| endif() | ||||
|  | ||||
| if(USE_PRIORITIZED_TEXT_FOR_LD) | ||||
|   add_compile_options( | ||||
|     $<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections> | ||||
|     $<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections> | ||||
|   ) | ||||
|   set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld") | ||||
|   set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt") | ||||
|  | ||||
|   add_custom_command( | ||||
|     OUTPUT "${LINKER_SCRIPT_FILE_OUT}" | ||||
|     COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}" | ||||
|     DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}" | ||||
|     COMMENT "Generating prioritized text linker files" | ||||
|     VERBATIM | ||||
|   ) | ||||
|  | ||||
|   add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}") | ||||
|  | ||||
|   if(BUILD_PYTHON) | ||||
|     set(LINKER_OPT_TARGETS torch_python) | ||||
|   endif() | ||||
|  | ||||
|   if(NOT BUILD_LIBTORCHLESS) | ||||
|     list(APPEND LINKER_OPT_TARGETS torch_cpu c10) | ||||
|     if(USE_CUDA) | ||||
|       list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda) | ||||
|     endif() | ||||
|     if(USE_XPU) | ||||
|       list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu) | ||||
|     endif() | ||||
|     if(USE_ROCM) | ||||
|       list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip) | ||||
|     endif() | ||||
|   endif() | ||||
|  | ||||
|   foreach(tgt IN LISTS LINKER_OPT_TARGETS) | ||||
|     if(TARGET ${tgt}) | ||||
|       add_dependencies("${tgt}" generate_linker_script) | ||||
|       target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}") | ||||
|       set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}") | ||||
|     else() | ||||
|        message(WARNING "Requested target '${tgt}' for linker script optimization was not found.") | ||||
|     endif() | ||||
|   endforeach() | ||||
|  | ||||
| else() | ||||
|   if(LINUX AND CPU_AARCH64) | ||||
|     message(WARNING [[ | ||||
|     It is strongly recommended to enable linker script optimization for all AArch64 Linux builds. | ||||
|     To do so, please export USE_PRIORITIZED_TEXT_FOR_LD=1 | ||||
|     ]]) | ||||
|   endif() | ||||
| endif() | ||||
|  | ||||
| @ -317,20 +317,10 @@ IF(USE_FBGEMM_GENAI) | ||||
|         -greedy-reverse-local-assignment=1 | ||||
|         -fhip-new-launch-api) | ||||
|  | ||||
|       # Only compile for gfx942 for now. | ||||
|       # This is rather hacky, I could not figure out a clean solution :( | ||||
|       set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS}) | ||||
|       string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}") | ||||
|       if("gfx942" IN_LIST PYTORCH_ROCM_ARCH) | ||||
|         list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;) | ||||
|       endif() | ||||
|       set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS}) | ||||
|  | ||||
|       hip_add_library( | ||||
|         fbgemm_genai STATIC | ||||
|         ${fbgemm_genai_native_rocm_hip} | ||||
|         HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) | ||||
|       set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL}) | ||||
|       set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) | ||||
|       target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) | ||||
|  | ||||
|  | ||||
| @ -65,24 +65,14 @@ DLDataType getDLDataType(const Tensor& t) { | ||||
|       break; | ||||
|     // TODO(#146647): use macro here instead of spelling out each shell dtype | ||||
|     case ScalarType::Float8_e5m2: | ||||
|       dtype.code = DLDataTypeCode::kDLFloat8_e5m2; | ||||
|       break; | ||||
|     case ScalarType::Float8_e5m2fnuz: | ||||
|       dtype.code = DLDataTypeCode::kDLFloat8_e5m2fnuz; | ||||
|       break; | ||||
|     case ScalarType::Float8_e4m3fn: | ||||
|       dtype.code = DLDataTypeCode::kDLFloat8_e4m3fn; | ||||
|       break; | ||||
|     case ScalarType::Float8_e4m3fnuz: | ||||
|       dtype.code = DLDataTypeCode::kDLFloat8_e4m3fnuz; | ||||
|       break; | ||||
|     case ScalarType::Float8_e8m0fnu: | ||||
|       dtype.code = DLDataTypeCode::kDLFloat8_e8m0fnu; | ||||
|       TORCH_CHECK_BUFFER(false, "float8 types are not supported by dlpack"); | ||||
|       break; | ||||
|     case ScalarType::Float4_e2m1fn_x2: | ||||
|       dtype.code = DLDataTypeCode::kDLFloat4_e2m1fn; | ||||
|       dtype.lanes = 2; | ||||
|       dtype.bits = 4; | ||||
|       TORCH_CHECK_BUFFER(false, "float4 types are not supported by dlpack"); | ||||
|       break; | ||||
|     case ScalarType::QInt8: | ||||
|     case ScalarType::QUInt8: | ||||
| @ -187,11 +177,7 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat | ||||
|  | ||||
| ScalarType toScalarType(const DLDataType& dtype) { | ||||
|   ScalarType stype = ScalarType::Undefined; | ||||
|   if (dtype.code != DLDataTypeCode::kDLFloat4_e2m1fn) { | ||||
|     TORCH_CHECK_BUFFER( | ||||
|         dtype.lanes == 1, | ||||
|         "ATen does not support lanes != 1 for dtype code", std::to_string(dtype.code)); | ||||
|   } | ||||
|   TORCH_CHECK_BUFFER(dtype.lanes == 1, "ATen does not support lanes != 1"); | ||||
|   switch (dtype.code) { | ||||
|     case DLDataTypeCode::kDLUInt: | ||||
|       switch (dtype.bits) { | ||||
| @ -283,73 +269,6 @@ ScalarType toScalarType(const DLDataType& dtype) { | ||||
|               false, "Unsupported kDLBool bits ", std::to_string(dtype.bits)); | ||||
|       } | ||||
|       break; | ||||
|     case DLDataTypeCode::kDLFloat8_e5m2: | ||||
|       switch (dtype.bits) { | ||||
|         case 8: | ||||
|           stype = ScalarType::Float8_e5m2; | ||||
|           break; | ||||
|         default: | ||||
|           TORCH_CHECK_BUFFER( | ||||
|               false, "Unsupported kDLFloat8_e5m2 bits ", std::to_string(dtype.bits)); | ||||
|       } | ||||
|       break; | ||||
|     case DLDataTypeCode::kDLFloat8_e5m2fnuz: | ||||
|       switch (dtype.bits) { | ||||
|         case 8: | ||||
|           stype = ScalarType::Float8_e5m2fnuz; | ||||
|           break; | ||||
|         default: | ||||
|           TORCH_CHECK_BUFFER( | ||||
|               false, "Unsupported kDLFloat8_e5m2fnuz bits ", std::to_string(dtype.bits)); | ||||
|       } | ||||
|       break; | ||||
|     case DLDataTypeCode::kDLFloat8_e4m3fn: | ||||
|       switch (dtype.bits) { | ||||
|         case 8: | ||||
|           stype = ScalarType::Float8_e4m3fn; | ||||
|           break; | ||||
|         default: | ||||
|           TORCH_CHECK_BUFFER( | ||||
|               false, "Unsupported kDLFloat8_e4m3fn bits ", std::to_string(dtype.bits)); | ||||
|       } | ||||
|       break; | ||||
|     case DLDataTypeCode::kDLFloat8_e4m3fnuz: | ||||
|       switch (dtype.bits) { | ||||
|         case 8: | ||||
|           stype = ScalarType::Float8_e4m3fnuz; | ||||
|           break; | ||||
|         default: | ||||
|           TORCH_CHECK_BUFFER( | ||||
|               false, "Unsupported kDLFloat8_e4m3fnuz bits ", std::to_string(dtype.bits)); | ||||
|       } | ||||
|       break; | ||||
|     case DLDataTypeCode::kDLFloat8_e8m0fnu: | ||||
|       switch (dtype.bits) { | ||||
|         case 8: | ||||
|           stype = ScalarType::Float8_e8m0fnu; | ||||
|           break; | ||||
|         default: | ||||
|           TORCH_CHECK_BUFFER( | ||||
|               false, "Unsupported kDLFloat8_e8m0fnu bits ", std::to_string(dtype.bits)); | ||||
|       } | ||||
|       break; | ||||
|     case DLDataTypeCode::kDLFloat4_e2m1fn: | ||||
|       switch (dtype.bits) { | ||||
|         case 4: | ||||
|           switch (dtype.lanes) { | ||||
|             case 2: | ||||
|               stype = ScalarType::Float4_e2m1fn_x2; | ||||
|               break; | ||||
|             default: | ||||
|               TORCH_CHECK_BUFFER( | ||||
|                 false, "Unsupported kDLFloat4_e2m1fn lanes ", std::to_string(dtype.lanes)); | ||||
|           } | ||||
|           break; | ||||
|         default: | ||||
|           TORCH_CHECK_BUFFER( | ||||
|               false, "Unsupported kDLFloat4_e2m1fn bits ", std::to_string(dtype.bits)); | ||||
|       } | ||||
|       break; | ||||
|     default: | ||||
|       TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code)); | ||||
|   } | ||||
| @ -401,13 +320,30 @@ T* toDLPackImpl(const Tensor& src) { | ||||
|   // The following code detects whether the src follows | ||||
|   // a contiguous pattern. If the src follows such a pattern (the common case), | ||||
|   // then we do not need to normalize the strides. | ||||
|   bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1; | ||||
|   bool need_normalize_strides = false; | ||||
|   int64_t expected_stride = 1; | ||||
|   for (int i = src.dim() - 1; i >= 0; i--) { | ||||
|     // detect that we do not meet the contiguous pattern | ||||
|     // and that the size is 1, so there is an opportunity to normalize | ||||
|     if (src.stride(i) != expected_stride && src.size(i) == 1) { | ||||
|       need_normalize_strides = true; | ||||
|       break; | ||||
|     } | ||||
|     expected_stride *= src.size(i); | ||||
|   } | ||||
|  | ||||
|   // less common case, try normalizing the strides | ||||
|   if (need_normalize_strides) { | ||||
|     // create a new tensor with possibly normalized strides | ||||
|     // gh-83069 | ||||
|     auto shape = src.sizes(); | ||||
|     view = src.as_strided(shape, {1}, src.storage_offset()); | ||||
|     auto strides = src.strides().vec(); | ||||
|     for (int i = 0; i < src.dim(); i++) { | ||||
|       if (shape[i] < 2) { | ||||
|         strides[i] = 1; | ||||
|       } | ||||
|     } | ||||
|     view = src.as_strided(shape, strides, src.storage_offset()); | ||||
|   } | ||||
|  | ||||
|   ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>); | ||||
| @ -418,8 +354,8 @@ T* toDLPackImpl(const Tensor& src) { | ||||
|   atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device()); | ||||
|   atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim()); | ||||
|   atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src); | ||||
|   atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(view.sizes().data()); | ||||
|   atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(view.strides().data()); | ||||
|   atDLMTensor->tensor.dl_tensor.shape = view.sizes().data(); | ||||
|   atDLMTensor->tensor.dl_tensor.strides = view.strides().data(); | ||||
|   atDLMTensor->tensor.dl_tensor.byte_offset = 0; | ||||
|   fillVersion(&atDLMTensor->tensor); | ||||
|  | ||||
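| A minimal sketch of the stride-normalization rule above (assuming a standalone helper for illustration, not the actual ATen internals): a dimension of size 0 or 1 is never stepped over, so its stride can be canonicalized to 1 before the arrays are handed to a DLPack consumer (gh-83069). | ||||
|  | ||||
| #include <cstddef> | ||||
| #include <cstdint> | ||||
| #include <vector> | ||||
|  | ||||
| // Hypothetical helper: canonicalize strides of dims with size < 2. | ||||
| std::vector<int64_t> normalize_strides(const std::vector<int64_t>& sizes, | ||||
|                                        std::vector<int64_t> strides) { | ||||
|   for (std::size_t i = 0; i < sizes.size(); ++i) { | ||||
|     if (sizes[i] < 2) { | ||||
|       strides[i] = 1;  // layout is unaffected; consumers see canonical strides | ||||
|     } | ||||
|   } | ||||
|   return strides; | ||||
| } | ||||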
|  | ||||
| @ -1637,7 +1637,9 @@ bool gemm_and_bias( | ||||
|   if (activation == GEMMAndBiasActivationEpilogue::RELU) { | ||||
|     epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; | ||||
|   } else if (activation == GEMMAndBiasActivationEpilogue::GELU) { | ||||
| #if CUDA_VERSION >= 11040 || defined(USE_ROCM) | ||||
|     epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|   if (bias != nullptr) { | ||||
| @ -1929,6 +1931,7 @@ void scaled_gemm( | ||||
|     bool use_fast_accum) { | ||||
|   // Note: see `cublasCommonArgs` for various non-intuitive manipulations | ||||
|   // of input arguments to this function. | ||||
| #if CUDA_VERSION >= 11080 || defined(USE_ROCM) | ||||
|   const auto computeType = CUBLAS_COMPUTE_32F; | ||||
|   const auto scaleType = CUDA_R_32F; | ||||
|   const float alpha_val = 1.0; | ||||
| @ -1951,8 +1954,8 @@ void scaled_gemm( | ||||
|   #if ROCM_VERSION >= 70000 | ||||
|             if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) { | ||||
|                 // TODO: add constraints based on hipblaslt internals | ||||
|                 TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0), | ||||
|                            "M, N must be multiples of 16 and K should be multiple of 128 for MX format. " | ||||
|                 TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0), | ||||
|                            "Matrix dimensions must be multiples of 32 for MX format. " | ||||
|                            "Got m=", m, ", n=", n, ", k=", k); | ||||
|             } | ||||
|   #endif | ||||
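| A minimal sketch of the two MX-format shape guards above (assuming free functions for illustration; the real checks live inline in scaled_gemm): | ||||
|  | ||||
| #include <cstdint> | ||||
|  | ||||
| // Hypothetical helpers mirroring the guards above. | ||||
| bool mx_dims_ok_strict(int64_t m, int64_t n, int64_t k) { | ||||
|   // stricter rule: M and N multiples of 16, K a multiple of 128 | ||||
|   return (m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0); | ||||
| } | ||||
|  | ||||
| bool mx_dims_ok_relaxed(int64_t m, int64_t n, int64_t k) { | ||||
|   // relaxed rule: M, N, and K all multiples of 32 | ||||
|   return (m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0); | ||||
| } | ||||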
| @ -2130,6 +2133,8 @@ void scaled_gemm( | ||||
|       " scaleType ", | ||||
|       scaleType); | ||||
|   return; | ||||
| #endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM) | ||||
|   TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above"); | ||||
| } | ||||
|  | ||||
| void int8_gemm( | ||||
|  | ||||
| @ -266,14 +266,11 @@ CUDAGeneratorImpl::CUDAGeneratorImpl( | ||||
|  * See Note [Acquire lock when using random generators] | ||||
|  */ | ||||
| void CUDAGeneratorImpl::set_current_seed(uint64_t seed) { | ||||
|   if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { | ||||
|     state_->seed_ = seed; | ||||
|     state_->philox_offset_per_thread_ = 0; | ||||
|     no_reset_rnn_state_.clear(); | ||||
|   } else { | ||||
|     TORCH_CHECK(state_->seed_ == seed, "CUDAGeneratorImpl::set_current_seed can be called during stream capture only if new seed is the same as the original seed."); | ||||
|     // no-op case | ||||
|   } | ||||
|   at::cuda::assertNotCapturing( | ||||
|       "Cannot call CUDAGeneratorImpl::set_current_seed"); | ||||
|   state_->seed_ = seed; | ||||
|   state_->philox_offset_per_thread_ = 0; | ||||
|   no_reset_rnn_state_.clear(); | ||||
| } | ||||
|  | ||||
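| A minimal sketch of the capture-aware branch shown above (assuming a simplified state struct, not the real CUDAGeneratorImpl): | ||||
|  | ||||
| #include <cstdint> | ||||
| #include <stdexcept> | ||||
|  | ||||
| struct GenState {  // hypothetical; the real state holds more fields | ||||
|   uint64_t seed = 0; | ||||
|   uint64_t philox_offset_per_thread = 0; | ||||
| }; | ||||
|  | ||||
| // Outside capture, replace the seed and reset the offset; during capture, | ||||
| // re-setting the same seed is a no-op and a different seed is an error. | ||||
| void set_seed(GenState& s, uint64_t seed, bool capturing) { | ||||
|   if (!capturing) { | ||||
|     s.seed = seed; | ||||
|     s.philox_offset_per_thread = 0; | ||||
|   } else if (s.seed != seed) { | ||||
|     throw std::runtime_error( | ||||
|         "set_current_seed during capture requires the original seed"); | ||||
|   } | ||||
| } | ||||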
| /** | ||||
| @ -302,6 +299,9 @@ uint64_t CUDAGeneratorImpl::get_offset() const { | ||||
|  * Gets the current seed of CUDAGeneratorImpl. | ||||
|  */ | ||||
| uint64_t CUDAGeneratorImpl::current_seed() const { | ||||
|   // Debatable if current_seed() should be allowed in captured regions. | ||||
|   // Conservatively disallow it for now. | ||||
|   at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed"); | ||||
|   return state_->seed_; | ||||
| } | ||||
|  | ||||
| @ -346,6 +346,8 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const { | ||||
|  * and size of the internal state. | ||||
|  */ | ||||
| void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { | ||||
|   at::cuda::assertNotCapturing( | ||||
|       "Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing."); | ||||
|   static const size_t seed_size = sizeof(uint64_t); | ||||
|   static const size_t offset_size = sizeof(int64_t); | ||||
|   static const size_t total_size = seed_size + offset_size; | ||||
| @ -400,27 +402,15 @@ c10::intrusive_ptr<c10::GeneratorImpl> CUDAGeneratorImpl::graphsafe_get_state() | ||||
|  */ | ||||
| void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { | ||||
|   // see Note [Why enforce RNG offset % 4 == 0?] | ||||
|  | ||||
|   // Note: If you use CUDNN RNN's, calling | ||||
|   // set_philox_offset_per_thread instead of set_offset will cause the | ||||
|   // cudnn RNN rng state to become stale. | ||||
|   TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4"); | ||||
|   if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { | ||||
|     state_->philox_offset_per_thread_ = offset; | ||||
|   } else { | ||||
|     state_->offset_intragraph_ = offset; | ||||
|   } | ||||
|   state_->philox_offset_per_thread_ = offset; | ||||
| } | ||||
|  | ||||
| /** | ||||
|  * Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl. | ||||
|  */ | ||||
| uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const { | ||||
|   if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) { | ||||
|     return state_->philox_offset_per_thread_; | ||||
|   } else { | ||||
|     return state_->offset_intragraph_; | ||||
|   } | ||||
|   return state_->philox_offset_per_thread_; | ||||
| } | ||||
|  | ||||
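| A minimal sketch of the alignment rule enforced by the setter above (assuming an illustrative free function): Philox4x32 produces four 32-bit values per counter, so per-thread offsets advance in units of 4 and must stay 4-aligned. | ||||
|  | ||||
| #include <cstdint> | ||||
| #include <stdexcept> | ||||
|  | ||||
| // Hypothetical stand-in for the setter above. | ||||
| void set_philox_offset(uint64_t& offset_slot, uint64_t offset) { | ||||
|   if (offset % 4 != 0) {  // see Note [Why enforce RNG offset % 4 == 0?] | ||||
|     throw std::invalid_argument("offset must be a multiple of 4"); | ||||
|   } | ||||
|   offset_slot = offset; | ||||
| } | ||||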
| /** | ||||
|  | ||||
| @ -19,7 +19,7 @@ | ||||
| #define DLPACK_MAJOR_VERSION 1 | ||||
|  | ||||
| /*! \brief The current minor version of dlpack */ | ||||
| #define DLPACK_MINOR_VERSION 1 | ||||
| #define DLPACK_MINOR_VERSION 0 | ||||
|  | ||||
| /*! \brief DLPACK_DLL prefix for windows */ | ||||
| #ifdef _WIN32 | ||||
| @ -32,7 +32,9 @@ | ||||
| #define DLPACK_DLL | ||||
| #endif | ||||
|  | ||||
| // NOLINTNEXTLINE(modernize-deprecated-headers) | ||||
| #include <stdint.h> | ||||
| // NOLINTNEXTLINE(modernize-deprecated-headers) | ||||
| #include <stddef.h> | ||||
|  | ||||
| #ifdef __cplusplus | ||||
| @ -157,26 +159,6 @@ typedef enum { | ||||
|   kDLComplex = 5U, | ||||
|   /*! \brief boolean */ | ||||
|   kDLBool = 6U, | ||||
|   /*! \brief FP8 data types */ | ||||
|   kDLFloat8_e3m4 = 7U, | ||||
|   kDLFloat8_e4m3 = 8U, | ||||
|   kDLFloat8_e4m3b11fnuz = 9U, | ||||
|   kDLFloat8_e4m3fn = 10U, | ||||
|   kDLFloat8_e4m3fnuz = 11U, | ||||
|   kDLFloat8_e5m2 = 12U, | ||||
|   kDLFloat8_e5m2fnuz = 13U, | ||||
|   kDLFloat8_e8m0fnu = 14U, | ||||
|   /*! \brief FP6 data types | ||||
|    * Setting bits != 6 is currently unspecified, and the producer must ensure it is set | ||||
|    * while the consumer must stop importing if the value is unexpected. | ||||
|    */ | ||||
|   kDLFloat6_e2m3fn = 15U, | ||||
|   kDLFloat6_e3m2fn = 16U, | ||||
|   /*! \brief FP4 data types | ||||
|    * Setting bits != 4 is currently unspecified, and the producer must ensure it is set | ||||
|    * while the consumer must stop importing if the value is unexpected. | ||||
|    */ | ||||
|   kDLFloat4_e2m1fn = 17U, | ||||
| } DLDataTypeCode; | ||||
|  | ||||
| /*! | ||||
| @ -190,12 +172,6 @@ typedef enum { | ||||
|  *   - int8: type_code = 0, bits = 8, lanes = 1 | ||||
|  *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1 | ||||
|  *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits) | ||||
|  *   - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory) | ||||
|  *   - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory) | ||||
|  *   - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory) | ||||
|  * | ||||
|  *  When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e., | ||||
|  *  for a packed data set D ((D >> (i * bits)) & bit_mask) stores the i-th element. | ||||
|  */ | ||||
| typedef struct { | ||||
|   /*! | ||||
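| A minimal sketch of the little bit-endian packing rule quoted above (assuming a standalone helper for illustration, not part of dlpack.h): | ||||
|  | ||||
| #include <cstdint> | ||||
|  | ||||
| // For a packed byte D holding sub-byte elements of width `bits`, | ||||
| // element i is (D >> (i * bits)) & bit_mask. | ||||
| uint8_t unpack_subbyte(uint8_t packed, int bits, int i) { | ||||
|   const uint8_t bit_mask = static_cast<uint8_t>((1u << bits) - 1u); | ||||
|   return static_cast<uint8_t>((packed >> (i * bits)) & bit_mask); | ||||
| } | ||||
|  | ||||
| // e.g. for fp4 (bits = 4), unpack_subbyte(D, 4, 0) is the low nibble and | ||||
| // unpack_subbyte(D, 4, 1) the high nibble. | ||||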
| @ -253,12 +229,12 @@ typedef struct { | ||||
|   /*! \brief The data type of the pointer*/ | ||||
|   DLDataType dtype; | ||||
|   /*! \brief The shape of the tensor */ | ||||
|   int64_t* shape; | ||||
|   const int64_t* shape; | ||||
|   /*! | ||||
|    * \brief strides of the tensor (in number of elements, not bytes) | ||||
|    *  can be NULL, indicating tensor is compact and row-majored. | ||||
|    */ | ||||
|   int64_t* strides; | ||||
|   const int64_t* strides; | ||||
|   /*! \brief The offset in bytes to the beginning pointer to data */ | ||||
|   uint64_t byte_offset; | ||||
| } DLTensor; | ||||
| @ -293,7 +269,7 @@ typedef struct DLManagedTensor { | ||||
|   void (*deleter)(struct DLManagedTensor * self); | ||||
| } DLManagedTensor; | ||||
|  | ||||
| // bit masks used in the DLManagedTensorVersioned | ||||
| // bit masks used in the DLManagedTensorVersioned | ||||
|  | ||||
| /*! \brief bit mask to indicate that the tensor is read only. */ | ||||
| #define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL) | ||||
| @ -306,14 +282,6 @@ typedef struct DLManagedTensor { | ||||
|  */ | ||||
| #define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL) | ||||
|  | ||||
| /* | ||||
|  * \brief bit mask to indicate whether a sub-byte type is packed or padded. | ||||
|  * | ||||
|  * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can | ||||
|  * be set by the producer to signal that a tensor of sub-byte type is padded. | ||||
|  */ | ||||
| #define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL) | ||||
|  | ||||
| /*! | ||||
|  * \brief A versioned and managed C Tensor object, manage memory of DLTensor. | ||||
|  * | ||||
|  | ||||
| @ -171,8 +171,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { | ||||
|  | ||||
|   POINTWISE_BOXED(fill_.Scalar); | ||||
|   POINTWISE_BOXED(zero_); | ||||
|   // This is special because this op doesn't return anything | ||||
|   m.impl("_assert_tensor_metadata", native::_assert_tensor_metadata); | ||||
|  | ||||
| #undef UNARY_POINTWISE | ||||
| #undef UNARY_POINTWISE_ALL | ||||
|  | ||||
| @ -81,7 +81,7 @@ Tensor math_channel_shuffle(const Tensor& self, int64_t groups) { | ||||
|   // TODO: contiguous can be made to preserve the memory format | ||||
|   // of the input. However, since the above reshape clobbers h and w | ||||
|   // it may not be safe to do that, since channels_last contiguous | ||||
|   // may think oc and the last dim correspond to h,w? | ||||
|   // may think oc and the last dim correspond to h,w? | ||||
|   // It is not clear; however, from initial looking around it feels that | ||||
|   // this may not be correct. | ||||
|   // In this case channels last will likely require custom implementation | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| #pragma once | ||||
| #include <ATen/core/Tensor.h> | ||||
| #include <ATen/Config.h> | ||||
| #include <cstdint> | ||||
|  | ||||
| @ -97,38 +97,43 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) { | ||||
|   int64_t nDims = self.dim(); | ||||
|   TORCH_CHECK(nDims >= 2, "dimensions must be larger than 1"); | ||||
|  | ||||
|   auto height = self.sym_size(0); | ||||
|   auto width = self.sym_size(1); | ||||
|   int64_t height = self.size(0); | ||||
|   int64_t width = self.size(1); | ||||
|  | ||||
|   if (nDims > 2) { | ||||
|     int64_t dim1 = height; | ||||
|     for (const auto i : c10::irange(1, nDims)) { | ||||
|       if (self.sym_size(i) != height) { | ||||
|       if (self.size(i) != dim1) { | ||||
|         TORCH_CHECK(false, "all dimensions of input must be of equal length"); | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   auto storage_offset = self.sym_storage_offset(); | ||||
|   auto size = std::min(height, width); | ||||
|   int64_t storage_offset = self.storage_offset(); | ||||
|   std::vector<int64_t> sizes; | ||||
|   std::vector<int64_t> strides; | ||||
|   int64_t size = std::min(height, width); | ||||
|  | ||||
|   int64_t stride = 0; | ||||
|   for (const auto i : c10::irange(nDims)) { | ||||
|     stride += self.stride(i); | ||||
|   } | ||||
|   std::vector<SymInt> strides{stride}; | ||||
|   std::vector<SymInt> sizes{size}; | ||||
|   strides.push_back(stride); | ||||
|   sizes.push_back(size); | ||||
|  | ||||
|   auto main_diag = self.as_strided_symint(sizes, strides, storage_offset); | ||||
|   auto main_diag = self.as_strided(sizes, strides, storage_offset); | ||||
|   main_diag.fill_(fill_value); | ||||
|  | ||||
|   if (wrap && nDims == 2 && height > width + 1) { | ||||
|     auto step = width + 1; | ||||
|     auto wrap_size = ((self.numel() + step - 1) / step) - size; | ||||
|     std::vector<SymInt> wrap_sizes{wrap_size}; | ||||
|     std::vector<int64_t> wrap_sizes; | ||||
|  | ||||
|     auto offset = self.stride(0) * (width + 1); | ||||
|     int64_t step = width + 1; | ||||
|     int64_t wrap_size = ((self.numel() + step - 1) / step) - size; | ||||
|     wrap_sizes.push_back(wrap_size); | ||||
|  | ||||
|     auto wrap_diag = self.as_strided_symint(wrap_sizes, strides, storage_offset + offset); | ||||
|     int64_t offset = self.stride(0) * (width + 1); | ||||
|  | ||||
|     auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset); | ||||
|     wrap_diag.fill_(fill_value); | ||||
|   } | ||||
|  | ||||
|  | ||||
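| A minimal sketch of the diagonal-view arithmetic above (assuming standalone helpers for illustration): the main diagonal is a 1-D view of length min(height, width) whose stride is the sum of all per-dimension strides, since each diagonal step advances every index by one. | ||||
|  | ||||
| #include <cstdint> | ||||
| #include <numeric> | ||||
| #include <vector> | ||||
|  | ||||
| // Hypothetical helper: stride of the main diagonal view. | ||||
| int64_t diagonal_stride(const std::vector<int64_t>& strides) { | ||||
|   return std::accumulate(strides.begin(), strides.end(), int64_t{0}); | ||||
| } | ||||
|  | ||||
| // e.g. a 3x3 row-major matrix has strides {3, 1}, so its diagonal view | ||||
| // has stride 4 and touches elements 0, 4, 8. | ||||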
| @ -23,6 +23,8 @@ Tensor& max_unpooling2d_forward_out_cpu( | ||||
|   // Nondeterministic with duplicate indices | ||||
|   at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out"); | ||||
|  | ||||
|   auto oheight = output_size[0]; | ||||
|   auto owidth = output_size[1]; | ||||
|   TORCH_CHECK( | ||||
|       indices_.scalar_type() == at::ScalarType::Long, | ||||
|       "elements in indices should be type int64 but got: ", indices_.scalar_type()); | ||||
| @ -43,9 +45,6 @@ Tensor& max_unpooling2d_forward_out_cpu( | ||||
|                 self_.sizes(), " with dimension ", i , " being empty."); | ||||
|   } | ||||
|  | ||||
|   auto oheight = output_size[0]; | ||||
|   auto owidth = output_size[1]; | ||||
|  | ||||
|   auto memory_format = self_.suggest_memory_format(); | ||||
|   auto self = self_.contiguous(memory_format); | ||||
|   auto indices = indices_.contiguous(memory_format); | ||||
|  | ||||
| @ -73,7 +73,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) | ||||
|     for (const auto i : c10::irange((size_t)l_pad)) { | ||||
|         auto pad_idx = pad.size() - ((i + 1) * 2); | ||||
|         auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; | ||||
|         TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", | ||||
|         TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", | ||||
|                  pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, " | ||||
|                  "which is invalid. Check dimension ", l_diff + i, " of your input."); | ||||
|         new_shape.emplace_back(new_dim); | ||||
|  | ||||
| @ -85,11 +85,11 @@ void cpu_max_unpool( | ||||
|     if constexpr (is_3d) { | ||||
|       TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(), | ||||
|           " (output volumes are of size ", output_depth, | ||||
|           "x", output_height, "x", output_width, ")"); | ||||
|           "x", output_height, "x", output_width); | ||||
|     } else { | ||||
|       TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(), | ||||
|           " (output volumes are of size ", output_height, | ||||
|           "x", output_width, ")"); | ||||
|           "x", output_width); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|  | ||||
| @ -1138,14 +1138,9 @@ bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) { | ||||
| bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) { | ||||
|   // TODO: We might want to enforce some structure on the shapes of the scale | ||||
|   // tensors | ||||
|   bool is_fp8_path = (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu | ||||
|       && scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4)); | ||||
|   bool is_packed_fp4_path = false; | ||||
| #ifdef USE_ROCM | ||||
|   is_packed_fp4_path = (t.scalar_type() == ScalarType::Float4_e2m1fn_x2 && scale.scalar_type() == at::kFloat8_e8m0fnu | ||||
|       && scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1) * 2, 32), 4)); | ||||
| #endif | ||||
|   return (is_fp8_path || is_packed_fp4_path) && scale.is_contiguous(); | ||||
|   return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu | ||||
|       && scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4) | ||||
|       && scale.is_contiguous()); | ||||
| } | ||||
|  | ||||
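| A minimal sketch of the scale-shape rule above (assuming standalone helpers mirroring round_up and ceil_div): for 1x32 blockwise scaling, rows are padded to a multiple of 128 and the per-row block count ceil(K / 32) is padded to a multiple of 4. | ||||
|  | ||||
| #include <cstdint> | ||||
|  | ||||
| int64_t ceil_div(int64_t a, int64_t b) { return (a + b - 1) / b; } | ||||
| int64_t round_up(int64_t a, int64_t b) { return ceil_div(a, b) * b; } | ||||
|  | ||||
| // Hypothetical helper: expected scale.numel() for a rows x cols operand. | ||||
| int64_t expected_1x32_scale_numel(int64_t rows, int64_t cols) { | ||||
|   return round_up(rows, 128) * round_up(ceil_div(cols, 32), 4); | ||||
| } | ||||
|  | ||||
| // e.g. rows = 64, cols = 96: round_up(64, 128) = 128 and | ||||
| // round_up(ceil_div(96, 32), 4) = 4, so 512 scale elements are expected. | ||||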
| bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) { | ||||
| @ -1386,15 +1381,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, | ||||
|     TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}), | ||||
|                 "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950"); | ||||
|  | ||||
|     int packed_factor = 1; | ||||
|     if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) { | ||||
|       // For float4 data type, each byte stores two 4-bit floating-point values, | ||||
|       // effectively packing two elements into one byte. | ||||
|       packed_factor = 2; | ||||
|     } | ||||
|     TORCH_CHECK(mat1.size(0) % 16 == 0 && (mat1.size(1) * packed_factor) % 128 == 0 && | ||||
|                 mat2.size(1) % 16 == 0, | ||||
|                 "M, N must be multiples of 16 and K must be multiple of 128 for block-wise scaling"); | ||||
|     TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 && | ||||
|                 mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0, | ||||
|                 "Matrix dimensions must be multiples of 32 for block-wise scaling"); | ||||
|  | ||||
|     TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 || | ||||
|                 out.scalar_type() == ScalarType::Half, | ||||
|  | ||||
| @ -999,41 +999,12 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { | ||||
|       dtypes[i] = iter.dtype(i); | ||||
|     } | ||||
|     auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter); | ||||
| #ifdef USE_ROCM | ||||
|     constexpr int grp_sz = 128; | ||||
|     launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) { | ||||
|       if (unrl) { | ||||
|         auto offsets0 = offset_calc.get(idx); | ||||
|         auto offsets1 = offset_calc.get(idx + grp_sz); | ||||
|         auto offsets2 = offset_calc.get(idx + grp_sz * 2); | ||||
|         auto offsets3 = offset_calc.get(idx + grp_sz * 3); | ||||
|         void* out0 = data[0] + offsets0[0]; | ||||
|         void* out1 = data[0] + offsets1[0]; | ||||
|         void* out2 = data[0] + offsets2[0]; | ||||
|         void* out3 = data[0] + offsets3[0]; | ||||
|         arg0_t result0 = invoke(f, &data[1], &offsets0[1], &dtypes[1], 1); | ||||
|         arg0_t result1 = invoke(f, &data[1], &offsets1[1], &dtypes[1], 1); | ||||
|         arg0_t result2 = invoke(f, &data[1], &offsets2[1], &dtypes[1], 1); | ||||
|         arg0_t result3 = invoke(f, &data[1], &offsets3[1], &dtypes[1], 1); | ||||
|         c10::cast_and_store<arg0_t>(dtypes[0], out0, result0); | ||||
|         c10::cast_and_store<arg0_t>(dtypes[0], out1, result1); | ||||
|         c10::cast_and_store<arg0_t>(dtypes[0], out2, result2); | ||||
|         c10::cast_and_store<arg0_t>(dtypes[0], out3, result3); | ||||
|       } else { | ||||
|         auto offsets = offset_calc.get(idx); | ||||
|         void* out = data[0] + offsets[0]; | ||||
|         arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); | ||||
|         c10::cast_and_store<arg0_t>(dtypes[0], out, result); | ||||
|       } | ||||
|     }); | ||||
| #else | ||||
|     launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) { | ||||
|       auto offsets = offset_calc.get(idx); | ||||
|       void* out = data[0] + offsets[0]; | ||||
|       arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); | ||||
|       c10::cast_and_store<arg0_t>(dtypes[0], out, result); | ||||
|     }); | ||||
| #endif | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -51,7 +51,7 @@ std::vector<Tensor> foreach_tensor_list_op( | ||||
|       Op<opmath_t>(), | ||||
|       alpha.to<opmath_t>()); | ||||
|  | ||||
|   return std::move(tensor_lists[2]); | ||||
|   return tensor_lists[2]; | ||||
| } | ||||
|  | ||||
| template <typename T, template <class> class Op> | ||||
|  | ||||
| @ -45,7 +45,7 @@ std::vector<Tensor> foreach_binary_op( | ||||
|           /* res_arg_index */ 1>(), | ||||
|       Op<opmath_t>(), | ||||
|       scalar.to<opmath_t>()); | ||||
|   return std::move(tensor_lists[1]); | ||||
|   return tensor_lists[1]; | ||||
| } | ||||
|  | ||||
| template <typename T, template <class> class Op> | ||||
|  | ||||
| @ -33,7 +33,7 @@ std::vector<Tensor> foreach_binary_op( | ||||
|   } | ||||
|  | ||||
|   tensor_lists.emplace_back(tensors.vec()); | ||||
|   tensor_lists.emplace_back(std::move(vec_res)); | ||||
|   tensor_lists.emplace_back(vec_res); | ||||
|  | ||||
|   using opmath_t = at::opmath_type<T>; | ||||
|   multi_tensor_apply<2, opmath_t>( | ||||
| @ -46,7 +46,7 @@ std::vector<Tensor> foreach_binary_op( | ||||
|           /* res_arg_index */ 1>(), | ||||
|  | ||||
|       Op<opmath_t>()); | ||||
|   return std::move(tensor_lists[1]); | ||||
|   return tensor_lists[1]; | ||||
| } | ||||
|  | ||||
| template <typename T, template <class> class Op> | ||||
|  | ||||
| @ -56,7 +56,7 @@ std::vector<Tensor> foreach_binary_op( | ||||
|       Op<opmath_t>(), | ||||
|       scalar.data_ptr<T>(), | ||||
|       alpha.to<opmath_t>()); | ||||
|   return std::move(tensor_lists[1]); | ||||
|   return tensor_lists[1]; | ||||
| } | ||||
|  | ||||
| template <typename T, template <class> class Op> | ||||
|  | ||||
| @ -57,7 +57,7 @@ std::vector<Tensor> foreach_pointwise_op( | ||||
|             scalar.to<opmath_t>()); | ||||
|       }); | ||||
|  | ||||
|   return std::move(tensor_lists[3]); | ||||
|   return tensor_lists[3]; | ||||
| } | ||||
|  | ||||
| template <template <class> class Op> | ||||
| @ -160,7 +160,7 @@ std::vector<Tensor> foreach_pointwise_op( | ||||
|             Op<opmath_t>()); | ||||
|       }); | ||||
|  | ||||
|   return std::move(tensor_lists[3]); | ||||
|   return tensor_lists[3]; | ||||
| } | ||||
|  | ||||
| #define FOREACH_POINTWISE_OP_SCALAR(NAME, OP)                           \ | ||||
|  | ||||
| @ -37,7 +37,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda( | ||||
|     vec_res.emplace_back(at::native::empty_like(t)); | ||||
|   } | ||||
|   std::vector<std::vector<at::Tensor>> tensor_lists{ | ||||
|       tensors1.vec(), tensors2.vec(), tensors3.vec(), std::move(vec_res)}; | ||||
|       tensors1.vec(), tensors2.vec(), tensors3.vec(), vec_res}; | ||||
|  | ||||
|   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( | ||||
|       at::ScalarType::Half, | ||||
| @ -56,7 +56,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda( | ||||
|             LerpFunctor<opmath_t>()); | ||||
|       }); | ||||
|  | ||||
|   return std::move(tensor_lists[3]); | ||||
|   return tensor_lists[3]; | ||||
| } | ||||
|  | ||||
| void foreach_tensor_lerp_ternary_cuda_( | ||||
| @ -104,7 +104,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda( | ||||
|     vec_res.emplace_back(at::native::empty_like(t)); | ||||
|   } | ||||
|   std::vector<std::vector<at::Tensor>> tensor_lists{ | ||||
|       tensors1.vec(), tensors2.vec(), std::move(vec_res)}; | ||||
|       tensors1.vec(), tensors2.vec(), vec_res}; | ||||
|  | ||||
|   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( | ||||
|       at::ScalarType::Half, | ||||
| @ -124,7 +124,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda( | ||||
|             weight.to<opmath_t>()); | ||||
|       }); | ||||
|  | ||||
|   return std::move(tensor_lists[2]); | ||||
|   return tensor_lists[2]; | ||||
| } | ||||
|  | ||||
| void foreach_tensor_lerp_list_cuda_( | ||||
| @ -173,7 +173,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda( | ||||
|     vec_res.emplace_back(at::native::empty_like(t)); | ||||
|   } | ||||
|   std::vector<std::vector<at::Tensor>> tensor_lists{ | ||||
|       tensors1.vec(), tensors2.vec(), std::move(vec_res)}; | ||||
|       tensors1.vec(), tensors2.vec(), vec_res}; | ||||
|  | ||||
|   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( | ||||
|       at::ScalarType::Half, | ||||
| @ -193,7 +193,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda( | ||||
|             LerpFunctor<opmath_t>()); | ||||
|       }); | ||||
|  | ||||
|   return std::move(tensor_lists[2]); | ||||
|   return tensor_lists[2]; | ||||
| } | ||||
|  | ||||
| void foreach_tensor_lerp_scalarlist_cuda_( | ||||
|  | ||||
| @ -67,7 +67,7 @@ std::vector<Tensor> foreach_unary_op(TensorList tensors) { | ||||
|           /* res_arg_index */ 1>(), | ||||
|       Op<opmath_t>()); | ||||
|  | ||||
|   return std::move(tensor_lists[1]); | ||||
|   return tensor_lists[1]; | ||||
| } | ||||
|  | ||||
| template <typename scalar_t, template <class> class Op> | ||||
|  | ||||
| @ -125,6 +125,8 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_, | ||||
|   TORCH_CHECK( | ||||
|       indices_.scalar_type() == at::ScalarType::Long, | ||||
|       "elements in indices should be type int64 but got: ", indices_.scalar_type()); | ||||
|   auto oheight = output_size[0]; | ||||
|   auto owidth = output_size[1]; | ||||
|  | ||||
|   TensorArg output_arg{output, "output", 1}, self_arg{self_, "self_", 2}, | ||||
|       indices_arg{indices_, "indices_", 3}; | ||||
| @ -147,9 +149,6 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_, | ||||
|       output_size.size() == 2, | ||||
|       "There should be exactly two elements (height, width) in output_size, but got ", output_size.size(), " elements."); | ||||
|  | ||||
|   auto oheight = output_size[0]; | ||||
|   auto owidth = output_size[1]; | ||||
|  | ||||
|   int64_t dimw = 2; | ||||
|   int64_t dimh = 1; | ||||
|   int64_t numBatch = 1; | ||||
| @ -218,6 +217,9 @@ static void max_unpooling3d_shape_check( | ||||
|     IntArrayRef stride, | ||||
|     IntArrayRef padding, | ||||
|     const char *fn_name) { | ||||
|   int64_t oT = output_size[0]; | ||||
|   int64_t oH = output_size[1]; | ||||
|   int64_t oW = output_size[2]; | ||||
|   TORCH_CHECK( | ||||
|       indices.scalar_type() == at::ScalarType::Long, | ||||
|       "elements in indices should be type int64 but got: ", indices.scalar_type()); | ||||
| @ -248,10 +250,6 @@ static void max_unpooling3d_shape_check( | ||||
|       "strides should be greater than zero, but got stride: ", | ||||
|       stride); | ||||
|  | ||||
|   int64_t oT = output_size[0]; | ||||
|   int64_t oH = output_size[1]; | ||||
|   int64_t oW = output_size[2]; | ||||
|  | ||||
|   int dimw = 3; | ||||
|   int dimh = 2; | ||||
|   int dimt = 1; | ||||
| @ -404,6 +402,8 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_, | ||||
|     const Tensor& indices_, | ||||
|     IntArrayRef output_size, | ||||
|     Tensor& grad_input) { | ||||
|   int64_t oheight = output_size[0]; | ||||
|   int64_t owidth = output_size[1]; | ||||
|   TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); | ||||
|   TORCH_CHECK( | ||||
|       indices_.scalar_type() == at::ScalarType::Long, | ||||
| @ -426,9 +426,6 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_, | ||||
|  | ||||
|   TORCH_CHECK(output_size.size() == 2, "output_size must have two elements, got size: ", output_size.size()); | ||||
|  | ||||
|   int64_t oheight = output_size[0]; | ||||
|   int64_t owidth = output_size[1]; | ||||
|  | ||||
|   int64_t nInputCols, nInputRows, nInputPlane; | ||||
|  | ||||
|   int dimw = 2; | ||||
| @ -508,14 +505,13 @@ at::Tensor& max_unpooling3d_backward_out_cuda(const Tensor& grad_output_, | ||||
|     IntArrayRef padding, | ||||
|     Tensor& grad_input) { | ||||
|   TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); | ||||
|  | ||||
|   max_unpooling3d_shape_check( | ||||
|     self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()"); | ||||
|  | ||||
|   int64_t oT = output_size[0]; | ||||
|   int64_t oH = output_size[1]; | ||||
|   int64_t oW = output_size[2]; | ||||
|  | ||||
|   max_unpooling3d_shape_check( | ||||
|     self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()"); | ||||
|  | ||||
|   int batchSize = 0; | ||||
|   int inputSlices = 0; | ||||
|   int inputTime = 0; | ||||
|  | ||||
| @ -300,6 +300,8 @@ void nonzero_static_cuda_out_impl( | ||||
|     int64_t size, | ||||
|     int64_t fill_value, | ||||
|     Tensor& out) { | ||||
| #if defined(CUDA_VERSION) || defined(USE_ROCM) | ||||
|  | ||||
|   Tensor self_contiguous_ = self.contiguous(); | ||||
|   // see comment in nonzero_cuda_out_impl on reqs for out | ||||
|   bool out_correct_size = | ||||
| @ -375,6 +377,9 @@ void nonzero_static_cuda_out_impl( | ||||
|   if (need_to_copy) { | ||||
|     out.copy_(out_temp); | ||||
|   } | ||||
| #else | ||||
|   TORCH_CHECK(false, "Nonzero_static is not supported for cuda <= 11.4"); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| Tensor& nonzero_out_cuda(const Tensor& self, Tensor& out) { | ||||
|  | ||||
| @ -226,38 +226,6 @@ __global__ void CatArrayBatchedCopy_contig( | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| template <typename T, typename IndexType, int Dims, int batch_size, int stride_size, int alignment, int elems_per_vec> | ||||
| __global__ void CatArrayBatchedCopy_vectorized( | ||||
|     char* output, | ||||
|     CatArrInputTensorMetadata<T, IndexType, batch_size, stride_size> inputs, | ||||
|     TensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os, | ||||
|     const int concatDim, | ||||
|     IndexType trailingSize) { | ||||
|  | ||||
|     IndexType tid = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|     IndexType nElements = inputs.nElements[blockIdx.y] / elems_per_vec; | ||||
|  | ||||
|     if(tid >= nElements) return; | ||||
|  | ||||
|     const char * data = (char*)inputs.input[blockIdx.y]; | ||||
|     IndexType offset = inputs.offset[blockIdx.y] * trailingSize / elems_per_vec; | ||||
|     IndexType dimSize = inputs.dimSize[blockIdx.y] * trailingSize / elems_per_vec; | ||||
|     int64_t dataOffset = (int64_t)offset  * alignment; // in bytes | ||||
|  | ||||
|     IndexType stride = gridDim.x * blockDim.x; | ||||
|  | ||||
|     while( tid < nElements){ | ||||
|       int64_t elementOffset = (int64_t)CatArrIndexToOffset<IndexType, Dims>::compute( | ||||
|                     os.tensorSize, os.tensorStride, dimSize, concatDim, tid) * alignment; // in bytes | ||||
|       auto vec = at::native::memory::ld_vec<alignment>(data + (int64_t)alignment * tid); | ||||
|       at::native::memory::st_vec<alignment>(output + dataOffset + elementOffset, vec); | ||||
|       tid += stride; | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| /* | ||||
|   Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads | ||||
|   to improve memory bandwidth throughput. | ||||
| @ -328,27 +296,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
|   scalar_t *data = (scalar_t *)(out.mutable_data_ptr()); | ||||
|   CatArrInputTensorMetadata<scalar_t, unsigned int, batch_size, stride_size> catMetaData; | ||||
|   TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> outputParam; | ||||
|   // If all batches are contiguous we can call a specialized implementation | ||||
|   // which requires the input tensor addresses to be aligned to a | ||||
|   // 16 Byte boundary. | ||||
|  | ||||
|   constexpr bool isContig = stride_size == 1; | ||||
|   bool isAligned = true; | ||||
|   constexpr int alignment = 16; | ||||
|  | ||||
|   // Next, let's initialize the size, stride arrays for the output Tensor. | ||||
|   // for contig case, we'll canonicalize output strides, so that | ||||
|   // we don't have arbitrary strides for dims of size 0 | ||||
|   size_t stride0 = 1; | ||||
|   if (memory_format == c10::MemoryFormat::Contiguous) { | ||||
|     for (int i = nDims - 1; i >= 0; --i) { | ||||
|     for (int i = 0; i < nDims; ++i) { | ||||
|       outputParam.tensorSize[i] = out.size(i); | ||||
|       if (isContig) { | ||||
|         outputParam.tensorStride[i] = stride0; | ||||
|         stride0 *= out.size(i); | ||||
|       } else { | ||||
|         outputParam.tensorStride[i] = out.stride(i); | ||||
|       } | ||||
|       outputParam.tensorStride[i] = out.stride(i); | ||||
|     } | ||||
|   } else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) { | ||||
|     // permute the semantics of dims from NCHW to NHWC so that the input | ||||
| @ -367,15 +320,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
|  | ||||
|   at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); | ||||
|  | ||||
|   // If all batches are contiguous we can call a specialized implementation | ||||
|   // which requires the input tensor addresses to be aligned to a | ||||
|   // 16 Byte boundary. | ||||
|  | ||||
|   // for channels last, computing the slice size correctly is much more involved, so we never send it | ||||
|   // on the fully vectorized path. | ||||
|   // we need the output stride in the cat dimension to be a multiple of alignment | ||||
|   // if we ever use it to compute offsets; | ||||
|   // for catting in the 0th dimension it doesn't matter | ||||
|   bool isInOutAligned = isContig && at::native::memory::get_alignment(data) >= alignment && | ||||
|                         memory_format == c10::MemoryFormat::Contiguous && (dimension == 0 || | ||||
|                         outputParam.tensorStride[dimension - 1] * sizeof(scalar_t) % alignment == 0); | ||||
|   bool isContig = true; | ||||
|   bool isAligned = true; | ||||
|   unsigned int max_elements_per_tensor = 0; | ||||
|  | ||||
|   // Now we loop | ||||
| @ -391,16 +341,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
|       // high-dimensional tensor | ||||
|       if (inputs[i+batchCounter].get().numel() > 0) { | ||||
|         dimSize = inputs[i+batchCounter].get().size(dimension); | ||||
|         if (isInOutAligned) { | ||||
|           auto t = inputs[i+batchCounter].get(); | ||||
|           // similarly to the output stride, we cannot trust the stride value to | ||||
|           // determine the slice size if the corresponding dimension is 1; | ||||
|           // we have to multiply all the subsequent sizes | ||||
|           int64_t slice_size = dimension == 0 ? t.numel() : t.sizes()[dimension - 1] != 1 ? | ||||
|              t.strides()[dimension - 1] : c10::multiply_integers(t.sizes().begin() + dimension, t.sizes().end()); | ||||
|           slice_size *= sizeof(scalar_t); | ||||
|           isInOutAligned &= (slice_size % alignment == 0); | ||||
|         } | ||||
|       } | ||||
|  | ||||
|       catMetaData.input[batchCounter] = (scalar_t*)(inputs[i+batchCounter].get().const_data_ptr()); | ||||
| @ -411,12 +351,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
| #ifdef USE_ROCM | ||||
|       // On ROCm, CatArrayBatchedCopy_contig is faster | ||||
|       isAligned = false; | ||||
|       isInOutAligned = false; | ||||
| #else | ||||
|       // If at least one of the inputs is not aligned, we can't call the | ||||
|       // CatArrayBatchedCopy_alignedK_contig | ||||
|       isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]); | ||||
|       isInOutAligned &= at::native::memory::get_alignment(catMetaData.input[batchCounter]) >= alignment; | ||||
| #endif | ||||
|  | ||||
|       if (stride_size > 1) { | ||||
| @ -427,6 +365,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
|           catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j]; | ||||
|         } | ||||
|         catMetaData.isContiguous[batchCounter] = false; | ||||
|         isContig = false; | ||||
|       } else { | ||||
|         catMetaData.isContiguous[batchCounter] = true; | ||||
|       } | ||||
| @ -449,13 +388,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
|           max_elements_per_tensor, batchCounter); | ||||
| #else | ||||
|     dim3 applyBlock, catGrid; | ||||
|     if (isInOutAligned) { | ||||
|       std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, alignment>( | ||||
|         max_elements_per_tensor, batchCounter); | ||||
|     } else if (isContig && isAligned && sizeof(scalar_t) > 2) { | ||||
|     if (isContig && sizeof(scalar_t) > 2) { | ||||
|       std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_16>( | ||||
|           max_elements_per_tensor, batchCounter); | ||||
|     } else if (isContig && isAligned && sizeof(scalar_t) == 2) { | ||||
|     } else if (isContig && sizeof(scalar_t) == 2) { | ||||
|       std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_8>( | ||||
|           max_elements_per_tensor, batchCounter); | ||||
|     } else { | ||||
| @ -463,30 +399,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
|       getCatGrid(batchCounter, catGrid); | ||||
|     } | ||||
| #endif | ||||
|     int32_t trailingSize; | ||||
|     TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam; | ||||
|     if (isInOutAligned) { | ||||
|       // in this case we can and should flatten the tensors after the cat dim. | ||||
|       // we want to view the tensors as if consisting of `alignment`-sized elements; | ||||
|       // however, we might not be able to cleanly divide just the last dim - | ||||
|       // it might not be a multiple of alignment. | ||||
|       // however, we know that the full concatenated slice is a multiple of alignment, | ||||
|       // so if we flatten all the dims after and including the concat dim, | ||||
|       // it will be divisible by alignment. | ||||
|       // then we need to divide the last out size by elems_per_vec, | ||||
|       // and divide all strides except the last by elems_per_vec (the last stride is always 1) | ||||
|       // for input, we will fix up the sizes and strides in the kernel directly | ||||
|       kernelOutputParam = outputParam; | ||||
|       nDims = dimension + 1; | ||||
|       constexpr auto elems_per_vec = alignment / sizeof(scalar_t); | ||||
|       auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1]; | ||||
|       kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec; | ||||
|       trailingSize = outputParam.tensorStride[dimension]; | ||||
|       kernelOutputParam.tensorStride[dimension] = 1; | ||||
|       for (int i = 0; i < dimension; ++i) { | ||||
|         kernelOutputParam.tensorStride[i] /= elems_per_vec; | ||||
|       } | ||||
|     } | ||||
|  | ||||
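| A minimal sketch of the flattening arithmetic described in the comments above (assuming host-side illustration only, not the actual launch code): | ||||
|  | ||||
| #include <cstddef> | ||||
|  | ||||
| constexpr std::size_t kAlignment = 16;  // bytes per vectorized load/store | ||||
|  | ||||
| // Viewing the output as 16-byte vectors divides the flattened concat-dim | ||||
| // size and every leading stride by elems_per_vec. | ||||
| template <typename scalar_t> | ||||
| constexpr std::size_t elems_per_vec = kAlignment / sizeof(scalar_t); | ||||
|  | ||||
| // For float (4 bytes), elems_per_vec<float> == 4: a 128-float slice | ||||
| // becomes 32 vectors and a leading stride of 256 floats becomes 64. | ||||
| static_assert(elems_per_vec<float> == 4, "16 bytes hold four floats"); | ||||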
|     if (memory_format != c10::MemoryFormat::Contiguous) { | ||||
|       switch (dimension) { | ||||
| @ -501,12 +413,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i | ||||
|     } | ||||
|     // Template Declarations for dim = 1, 2, 3, 4 | ||||
| #define HANDLE_CASE(DIMS) \ | ||||
|     if (isInOutAligned) {\ | ||||
|       constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \ | ||||
|       CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\ | ||||
|       catGrid, applyBlock, 0, stream.stream()>>>(\ | ||||
|         (char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\ | ||||
|     } else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\ | ||||
|     if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\ | ||||
|       CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\ | ||||
|           catGrid, applyBlock, 0, stream.stream()>>>(\ | ||||
|               data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\ | ||||
|  | ||||
| @ -221,9 +221,22 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_ | ||||
|   std::optional<CuFFTConfig> uncached_plan; | ||||
|   const CuFFTConfig * config = nullptr; | ||||
|  | ||||
|   // Workaround for gh-63152, gh-58724 | ||||
|   // Bluestein plans in CUDA 11.1 (cufft 10.3) cannot be re-used | ||||
|   // Bluestein's algorithm is only used when a size has large prime factors; | ||||
|   // sizes with only small prime factors can still be cached | ||||
|   if (plan_cache.max_size() > 0) { | ||||
|   bool use_caching = true; | ||||
| #ifdef CUFFT_VERSION | ||||
|   if constexpr (10300 <= CUFFT_VERSION && CUFFT_VERSION < 10400) { | ||||
|     // Only cache plans for transforms with small prime factors | ||||
|     use_caching = std::none_of( | ||||
|         signal_size.begin() + 1, signal_size.end(), [](int64_t dim_size) { | ||||
|       return has_large_prime_factor(dim_size); | ||||
|     }); | ||||
|   } | ||||
| #endif | ||||
|  | ||||
|   if (use_caching && plan_cache.max_size() > 0) { | ||||
|     guard.lock(); | ||||
|     if (plan_cache.max_size() > 0) {  // check again after acquiring the lock | ||||
|       config = &plan_cache.lookup(Params); | ||||
|  | ||||
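| A minimal sketch of the caching predicate above (assuming a simplified stand-in for has_large_prime_factor): cuFFT has native code paths for sizes whose prime factors are all in {2, 3, 5, 7}; any leftover factor implies Bluestein's algorithm. | ||||
|  | ||||
| #include <cstdint> | ||||
| #include <initializer_list> | ||||
|  | ||||
| // Hypothetical simplified version for illustration. | ||||
| bool has_large_prime_factor_sketch(int64_t n) { | ||||
|   for (int64_t p : {2, 3, 5, 7}) { | ||||
|     while (n % p == 0) { | ||||
|       n /= p; | ||||
|     } | ||||
|   } | ||||
|   return n != 1;  // whatever remains is divisible only by primes > 7 | ||||
| } | ||||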
| @ -5,20 +5,12 @@ | ||||
|  | ||||
| namespace at::native { | ||||
|  | ||||
| __global__ void weight_int8pack_mm_kernel( | ||||
|     const float* x, | ||||
|     const int8_t* w, | ||||
|     const float* scale, | ||||
|     float* out, | ||||
|     int B, | ||||
|     int K, | ||||
|     int N) { | ||||
| __global__ void weight_int8pack_mm_kernel(const float* x, const int8_t* w, const float* scale, float* out, int B, int K, int N) { | ||||
|   // one thread per output element: [B, N] | ||||
|   int b = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|   int n = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|  | ||||
|   if (b >= B || n >= N) | ||||
|     return; | ||||
|   if (b >= B || n >= N) return; | ||||
|  | ||||
|   float acc = 0.0f; | ||||
|   for (int k = 0; k < K; ++k) { | ||||
| @ -28,11 +20,7 @@ __global__ void weight_int8pack_mm_kernel( | ||||
|   out[b * N + n] = acc * scale[n]; | ||||
| } | ||||
|  | ||||
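| A minimal sketch of a CPU reference for the kernel above (assuming plain C++ for illustration): out[b][n] is a length-K dot product of a float activation row with an int8 weight row, scaled by the per-output-channel factor scale[n]. | ||||
|  | ||||
| #include <cstdint> | ||||
|  | ||||
| // Hypothetical reference implementation; the GPU kernel assigns one | ||||
| // thread per output element instead of looping over (b, n). | ||||
| void weight_int8pack_mm_ref(const float* x, const int8_t* w, | ||||
|                             const float* scale, float* out, | ||||
|                             int B, int K, int N) { | ||||
|   for (int b = 0; b < B; ++b) { | ||||
|     for (int n = 0; n < N; ++n) { | ||||
|       float acc = 0.0f; | ||||
|       for (int k = 0; k < K; ++k) { | ||||
|         acc += x[b * K + k] * static_cast<float>(w[n * K + k]); | ||||
|       } | ||||
|       out[b * N + n] = acc * scale[n]; | ||||
|     } | ||||
|   } | ||||
| } | ||||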
| void launch_weight_int8pack_mm_cuda_kernel( | ||||
|     const Tensor& x, | ||||
|     const Tensor& w_int8, | ||||
|     const Tensor& scale, | ||||
|     Tensor& out) { | ||||
| void launch_weight_int8pack_mm_cuda_kernel(const Tensor& x, const Tensor& w_int8, const Tensor& scale, Tensor& out) { | ||||
|   const int B = x.size(0); | ||||
|   const int K = x.size(1); | ||||
|   const int N = w_int8.size(0); | ||||
| @ -47,16 +35,12 @@ void launch_weight_int8pack_mm_cuda_kernel( | ||||
|       w_int8.data_ptr<int8_t>(), | ||||
|       scale.data_ptr<float>(), | ||||
|       out.data_ptr<float>(), | ||||
|       B, | ||||
|       K, | ||||
|       N); | ||||
|       B, K, N); | ||||
| } | ||||
|  | ||||
|  | ||||
| // Main GPU entry point | ||||
| at::Tensor _weight_int8pack_mm_cuda( | ||||
|     const at::Tensor& x, | ||||
|     const at::Tensor& w_int8, | ||||
|     const at::Tensor& scale) { | ||||
| at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int8, const at::Tensor& scale) { | ||||
|   // --- Check inputs --- | ||||
|   TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor"); | ||||
|   TORCH_CHECK(w_int8.is_cuda(), "w must be a CUDA tensor"); | ||||
| @ -66,16 +50,12 @@ at::Tensor _weight_int8pack_mm_cuda( | ||||
|   TORCH_CHECK(w_int8.dim() == 2, "w must be 2D"); | ||||
|   TORCH_CHECK(scale.dim() == 1, "scale must be 1D"); | ||||
|  | ||||
|   TORCH_CHECK( | ||||
|       x.size(1) == w_int8.size(1), | ||||
|       "K dimension mismatch: x.size(1) != w.size(1)"); | ||||
|   TORCH_CHECK( | ||||
|       w_int8.size(0) == scale.size(0), | ||||
|       "Output dim mismatch: w.size(0) != scale.size(0)"); | ||||
|   TORCH_CHECK(x.size(1) == w_int8.size(1), "K dimension mismatch: x.size(1) != w.size(1)"); | ||||
|   TORCH_CHECK(w_int8.size(0) == scale.size(0), "Output dim mismatch: w.size(0) != scale.size(0)"); | ||||
|  | ||||
|   // --- Determine shapes --- | ||||
|   auto B = x.size(0); // batch size | ||||
|   auto N = w_int8.size(0); // output dim | ||||
|   auto B = x.size(0);  // batch size | ||||
|   auto N = w_int8.size(0);  // output dim | ||||
|  | ||||
|   // Ensure inputs are in the correct types for the kernel | ||||
|   auto x_f32 = x.to(at::kFloat); | ||||
| @ -83,13 +63,12 @@ at::Tensor _weight_int8pack_mm_cuda( | ||||
|   auto scale_f32 = scale.to(at::kFloat); | ||||
|  | ||||
|   // --- Allocate output --- | ||||
|   auto out = at::empty({B, N}, x_f32.options()); | ||||
|   auto out = at::empty({B, N}, x.options().dtype(at::kFloat)); | ||||
|  | ||||
|   // --- Launch kernel --- | ||||
|   launch_weight_int8pack_mm_cuda_kernel( | ||||
|       x_f32, w_int8_contiguous, scale_f32, out); | ||||
|   launch_weight_int8pack_mm_cuda_kernel(x_f32, w_int8_contiguous, scale_f32, out); | ||||
|  | ||||
|   return out.to(x.dtype()); | ||||
|   return out; | ||||
| } | ||||
|  | ||||
| } // namespace at::native | ||||
|  | ||||
| @ -482,9 +482,7 @@ auto build_graph( | ||||
|   auto scaled_dot_product_flash_attention_options = | ||||
|       fe::graph::SDPA_attributes() | ||||
|           .set_name("CUDNN_SDPA") | ||||
|           .set_is_inference(return_softmaxstats == false) | ||||
|           // TODO(eqy): switch to this API once cuDNN FE is upgraded | ||||
|           // .set_generate_stats(return_softmaxstats) | ||||
|           .set_generate_stats(return_softmaxstats) | ||||
|           .set_causal_mask(is_causal) | ||||
|           .set_attn_scale(attn_scale); | ||||
|   if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { | ||||
| @ -704,9 +702,7 @@ auto build_graph_nestedtensor( | ||||
|   auto scaled_dot_product_flash_attention_options = | ||||
|       fe::graph::SDPA_attributes() | ||||
|           .set_name("CUDNN_SDPA_NESTEDTENSOR") | ||||
|           .set_is_inference(return_softmaxstats == false) | ||||
|           // TODO(eqy): switch to this API once cuDNN FE is upgraded | ||||
|           // .set_generate_stats(return_softmaxstats) | ||||
|           .set_generate_stats(return_softmaxstats) | ||||
|           .set_causal_mask(is_causal) | ||||
|           .set_attn_scale(attn_scale) | ||||
|           .set_seq_len_q(SEQ_LEN_Q_) | ||||
|  | ||||
| @ -2,7 +2,6 @@ | ||||
| #include <ATen/core/Tensor.h> | ||||
| #include <ATen/TensorUtils.h> | ||||
| #include <ATen/div_rtn.h> | ||||
| #include <c10/util/safe_numerics.h> | ||||
|  | ||||
| namespace at::native { | ||||
|  | ||||
| @ -55,14 +54,6 @@ inline void col2im_shape_check( | ||||
|  | ||||
|   int64_t batch_dim = (ndim == 3) ? 0 : -1; | ||||
|   int64_t n_input_plane = input.size(batch_dim + 1); | ||||
|   uint64_t prod_kernel_size = 1; | ||||
|  | ||||
|   TORCH_CHECK(!c10::mul_overflows(static_cast<uint64_t>(kernel_width), static_cast<uint64_t>(kernel_height), &prod_kernel_size), | ||||
|             "Given kernel_width = ", | ||||
|             kernel_width, | ||||
|             " and kernel_height = ", | ||||
|             kernel_height, | ||||
|             " the product of kernel_width and kernel_height overflowed."); | ||||
|  | ||||
|   if (n_input_plane % (kernel_width * kernel_height) != 0) { | ||||
|     TORCH_CHECK(false, | ||||
|  | ||||
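The check deleted above guarded the `kernel_width * kernel_height` product with `c10::mul_overflows`. A small sketch emulating that unsigned-64-bit overflow test (Python integers never wrap, so the wraparound is modeled explicitly):

    def mul_overflows_u64(a: int, b: int):
        # returns (overflowed, wrapped 64-bit product), mimicking c10::mul_overflows
        prod = a * b
        return prod >= 2**64, prod % 2**64

    overflowed, wrapped = mul_overflows_u64(2**40, 2**40)
    assert overflowed and wrapped == 2**16  # 2**80 wraps to 2**16 mod 2**64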
| @ -559,60 +559,4 @@ Tensor _int_mm_xpu(const Tensor& self, const Tensor& mat2) { | ||||
|       at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt)); | ||||
|   return _int_mm_out_xpu(self, mat2, result); | ||||
| } | ||||
|  | ||||
| Tensor _weight_int8pack_mm_xpu( | ||||
|     const Tensor& A, | ||||
|     const Tensor& B, | ||||
|     const Tensor& scales) { | ||||
|   auto M = A.size(0); | ||||
|   auto N = B.size(0); | ||||
|   auto K = A.size(1); | ||||
|  | ||||
|   TORCH_CHECK( | ||||
|       A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat, | ||||
|       " : expect A to be either 32-bit or 16-bit float tensor."); | ||||
|   TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor."); | ||||
|   TORCH_CHECK( | ||||
|       A.stride(1) == 1, " : A must be contiguous on the last dimension."); | ||||
|   TORCH_CHECK(B.dtype() == kChar, " : expect B to be int8 tensor."); | ||||
|   TORCH_CHECK(B.is_contiguous(), " : expect B to be contiguous."); | ||||
|   TORCH_CHECK(B.size(1) == K, " : expect B.size(1) == ", K); | ||||
|  | ||||
|   TORCH_CHECK( | ||||
|       scales.dim() == 1 && scales.size(0) == N, | ||||
|       " : expect scales to be 1d tensor with size ", | ||||
|       N); | ||||
|  | ||||
|   auto C = at::empty({M, N}, A.options()); | ||||
|  | ||||
|   // --- Launch kernel --- | ||||
|   Tensor bias = at::Tensor(); | ||||
|   Tensor mat2_zero_points = at::Tensor(); | ||||
|   Tensor non_const_scales = scales; | ||||
|   auto post_op_args = torch::List<std::optional<at::Scalar>>(); | ||||
|  | ||||
|   at::native::onednn::quantized_matmul( | ||||
|       A.contiguous(), | ||||
|       1.0, | ||||
|       0, | ||||
|       B, | ||||
|       non_const_scales, | ||||
|       mat2_zero_points, | ||||
|       bias, | ||||
|       C, | ||||
|       1.0, | ||||
|       0, | ||||
|       C.scalar_type(), | ||||
|       /*other*/ std::nullopt, | ||||
|       /*other scale*/ 1.0, | ||||
|       /*other zp*/ 0, | ||||
|       /*binary post op*/ "none", | ||||
|       /*binary alpha*/ 1.0, | ||||
|       /*post_op_name*/ "none", | ||||
|       post_op_args, | ||||
|       /*post_op_algorithm*/ "none", | ||||
|       /*m2_trans*/ false); | ||||
|  | ||||
|   return C; | ||||
| } | ||||
| } // namespace at::native | ||||
|  | ||||
| @ -110,9 +110,8 @@ void quantized_matmul( | ||||
|   // [Note] Quantized Matrix Multiplication at XPU | ||||
|   // The following code integrates oneDNN quantized gemm. The quantization | ||||
|   // config we support: | ||||
|   // activation: s8, u8, fp16, bf16, fp32; per tensor calibrated; | ||||
|   // symmetric&asymmetric weight: s8; per_tensor/per_channel calibrated; | ||||
|   // symmetric | ||||
|   // activation: s8&u8; per tensor calibrated; symmetric&asymmetric | ||||
|   // weight: s8; per_tensor/per_channel calibrated; symmetric | ||||
|   auto attr = Attr(static_cast<float>(1.0 / output_scale), output_zero_point); | ||||
|   construct_attr_by_post_op( | ||||
|       binary_post_op, | ||||
|  | ||||
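For the s8 weight path described in the note above, a typical symmetric per-channel calibration looks like the following sketch. This illustrates the quantization scheme only, not the oneDNN integration itself:

    import torch

    def quantize_weight_per_channel_sym(w: torch.Tensor):
        # one scale per output channel (row); zero point is 0 for symmetric s8
        scale = w.abs().amax(dim=1).clamp(min=1e-8) / 127.0
        w_int8 = torch.round(w / scale[:, None]).clamp(-127, 127).to(torch.int8)
        return w_int8, scale

    w = torch.randn(16, 8)
    w_int8, scale = quantize_weight_per_channel_sym(w)
    w_deq = w_int8.float() * scale[:, None]  # dequantized approximation of w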
| @ -1,25 +0,0 @@ | ||||
| #pragma once | ||||
| #include <c10/metal/common.h> | ||||
|  | ||||
| #ifdef __METAL__ | ||||
| enum class EmbeddingBagMode { SUM = 0, MEAN, MAX }; | ||||
| #else | ||||
| #include <ATen/native/EmbeddingBag.h> | ||||
| using at::native::EmbeddingBagMode; | ||||
| #endif | ||||
|  | ||||
| template <typename idx_type_t = uint32_t> | ||||
| struct EmbeddingBagParams { | ||||
|   ::c10::metal::array<idx_type_t, 2> weight_strides; | ||||
|   ::c10::metal::array<idx_type_t, 2> output_strides; | ||||
|   ::c10::metal::array<idx_type_t, 2> max_indices_strides; | ||||
|  | ||||
|   idx_type_t per_sample_weights_stride; | ||||
|  | ||||
|   idx_type_t num_indices; | ||||
|   idx_type_t num_bags; | ||||
|   idx_type_t feature_size; | ||||
|  | ||||
|   EmbeddingBagMode mode; | ||||
|   int64_t padding_idx; | ||||
| }; | ||||
| @ -1,261 +0,0 @@ | ||||
| #include <ATen/native/mps/kernels/EmbeddingBag.h> | ||||
| #include <c10/metal/utils.h> | ||||
| #include <metal_array> | ||||
| #include <metal_stdlib> | ||||
|  | ||||
| using namespace metal; | ||||
| using namespace c10::metal; | ||||
|  | ||||
| template <EmbeddingBagMode M, typename T> | ||||
| struct ReductionOpInit { | ||||
|   inline opmath_t<T> operator()() { | ||||
|     return 0; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <typename T> | ||||
| struct ReductionOpInit<EmbeddingBagMode::MAX, T> { | ||||
|   inline opmath_t<T> operator()() { | ||||
|     return static_cast<opmath_t<T>>(-INFINITY); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <EmbeddingBagMode M, typename T> | ||||
| struct ReductionOp { | ||||
|   inline opmath_t<T> operator()( | ||||
|       opmath_t<T> weight_val, | ||||
|       opmath_t<T> out_val, | ||||
|       bool is_first) { | ||||
|     return weight_val + out_val; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <typename T> | ||||
| struct ReductionOp<EmbeddingBagMode::MAX, T> { | ||||
|   inline opmath_t<T> operator()( | ||||
|       opmath_t<T> weight_val, | ||||
|       opmath_t<T> out_val, | ||||
|       bool is_first) { | ||||
|     return (is_first || weight_val > out_val) ? weight_val : out_val; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <EmbeddingBagMode M, typename T> | ||||
| struct MaybeApplyPerSampleWeight { | ||||
|   inline opmath_t<T> operator()( | ||||
|       opmath_t<T> weight_val, | ||||
|       uint32_t per_sample_weights_index, | ||||
|       constant T* per_sample_weights, | ||||
|       uint32_t per_sample_weights_stride) { | ||||
|     return weight_val; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <typename T> | ||||
| struct MaybeApplyPerSampleWeight<EmbeddingBagMode::SUM, T> { | ||||
|   inline opmath_t<T> operator()( | ||||
|       opmath_t<T> weight_val, | ||||
|       uint32_t per_sample_weights_index, | ||||
|       constant T* per_sample_weights, | ||||
|       uint32_t per_sample_weights_stride) { | ||||
|     if (per_sample_weights_stride) { | ||||
|       T per_sample_weight = per_sample_weights | ||||
|           [per_sample_weights_stride * per_sample_weights_index]; | ||||
|       return static_cast<opmath_t<T>>(per_sample_weight) * weight_val; | ||||
|     } else { | ||||
|       return weight_val; | ||||
|     } | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <EmbeddingBagMode M, typename T, typename I> | ||||
| struct MaybeCalcMaxIndex { | ||||
|   inline void operator()( | ||||
|       opmath_t<T> weight_val, | ||||
|       opmath_t<T> out_val, | ||||
|       bool is_first, | ||||
|       thread I& max_idx, | ||||
|       I weight_idx, | ||||
|       bool pad) {} | ||||
| }; | ||||
|  | ||||
| template <typename T, typename I> | ||||
| struct MaybeCalcMaxIndex<EmbeddingBagMode::MAX, T, I> { | ||||
|   inline void operator()( | ||||
|       opmath_t<T> weight_val, | ||||
|       opmath_t<T> out_val, | ||||
|       bool is_first, | ||||
|       thread I& max_idx, | ||||
|       I weight_idx, | ||||
|       bool pad) { | ||||
|     max_idx = !pad && (is_first || weight_val > out_val) ? weight_idx : max_idx; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <EmbeddingBagMode M, typename T> | ||||
| struct ReductionOpFinal { | ||||
|   inline T operator()(opmath_t<T> val, uint32_t) { | ||||
|     return static_cast<T>(val); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <typename T> | ||||
| struct ReductionOpFinal<EmbeddingBagMode::MEAN, T> { | ||||
|   inline T operator()(opmath_t<T> val, uint32_t count) { | ||||
|     auto out = val / count; | ||||
|     return static_cast<T>((count == 0) ? 0 : out); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <typename T> | ||||
| struct ReductionOpFinal<EmbeddingBagMode::MAX, T> { | ||||
|   inline T operator()(opmath_t<T> val, uint32_t count) { | ||||
|     return static_cast<T>((count == 0) ? 0 : val); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <EmbeddingBagMode M, typename I> | ||||
| struct MaybeWriteMaxIndex { | ||||
|   inline void operator()( | ||||
|       device I*, | ||||
|       const constant ::c10::metal::array<uint32_t, 2>&, | ||||
|       uint32_t, | ||||
|       uint32_t, | ||||
|       I) {} | ||||
| }; | ||||
|  | ||||
| template <typename I> | ||||
| struct MaybeWriteMaxIndex<EmbeddingBagMode::MAX, I> { | ||||
|   inline void operator()( | ||||
|       device I* max_indices, | ||||
|       const constant ::c10::metal::array<uint32_t, 2>& max_indices_strides, | ||||
|       uint32_t bag_idx, | ||||
|       uint32_t feature_idx, | ||||
|       I max_idx) { | ||||
|     max_indices | ||||
|         [bag_idx * max_indices_strides[0] + | ||||
|          feature_idx * max_indices_strides[1]] = max_idx; | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <EmbeddingBagMode M, typename T, typename I> | ||||
| void embedding_bag_impl( | ||||
|     constant T* weight, | ||||
|     constant I* indices, | ||||
|     constant I* offsets, | ||||
|     constant T* per_sample_weights, | ||||
|     device T* output, | ||||
|     device I* offset2bag, | ||||
|     device I* bag_size, | ||||
|     device I* max_indices, | ||||
|     constant EmbeddingBagParams<uint32_t>& params, | ||||
|     uint tid) { | ||||
|   auto num_indices = params.num_indices; | ||||
|   auto num_bags = params.num_bags; | ||||
|   auto feature_size = params.feature_size; | ||||
|   auto padding_idx = params.padding_idx; | ||||
|   auto per_sample_weights_stride = params.per_sample_weights_stride; | ||||
|   constant auto& output_strides = params.output_strides; | ||||
|   constant auto& weight_strides = params.weight_strides; | ||||
|   constant auto& max_indices_strides = params.max_indices_strides; | ||||
|  | ||||
|   auto bag_idx = tid / feature_size; | ||||
|   auto feature_idx = tid % feature_size; | ||||
|  | ||||
|   uint32_t offsets_end = min(bag_idx + 1, num_bags - 1); | ||||
|   bool is_last_bag = bag_idx + 1 == num_bags; | ||||
|   uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]); | ||||
|   uint32_t indices_end = is_last_bag * (num_indices) + | ||||
|       (!is_last_bag) * (static_cast<uint32_t>(offsets[offsets_end])); | ||||
|  | ||||
|   auto out_val = ReductionOpInit<M, T>()(); | ||||
|  | ||||
|   uint32_t bag_size_ = 0; | ||||
|   I max_idx = 0; | ||||
|  | ||||
|   for (uint32_t indices_idx = indices_start; indices_idx < indices_end; | ||||
|        indices_idx++) { | ||||
|     I weight_idx = indices[indices_idx]; | ||||
|     bool pad = (weight_idx == padding_idx); | ||||
|     auto weight_val = static_cast<opmath_t<T>>( | ||||
|         weight | ||||
|             [static_cast<uint32_t>(weight_idx) * weight_strides[0] + | ||||
|              feature_idx * weight_strides[1]]); | ||||
|  | ||||
|     weight_val = MaybeApplyPerSampleWeight<M, T>()( | ||||
|         weight_val, indices_idx, per_sample_weights, per_sample_weights_stride); | ||||
|  | ||||
|     auto new_out_val = ReductionOp<M, T>()(weight_val, out_val, bag_size_ == 0); | ||||
|  | ||||
|     MaybeCalcMaxIndex<M, T, I>()( | ||||
|         weight_val, out_val, bag_size_ == 0, max_idx, weight_idx, pad); | ||||
|  | ||||
|     out_val = pad ? out_val : new_out_val; | ||||
|     offset2bag[indices_idx] = bag_idx; | ||||
|     bag_size_ += static_cast<uint32_t>(!pad); | ||||
|   } | ||||
|  | ||||
|   output[bag_idx * output_strides[0] + feature_idx * output_strides[1]] = | ||||
|       ReductionOpFinal<M, T>()(out_val, bag_size_); | ||||
|  | ||||
|   bag_size[bag_idx] = bag_size_; | ||||
|  | ||||
|   MaybeWriteMaxIndex<M, I>()( | ||||
|       max_indices, max_indices_strides, bag_idx, feature_idx, max_idx); | ||||
| } | ||||
|  | ||||
| #define DISPATCH_IMPL(MODE)        \ | ||||
|   return embedding_bag_impl<MODE>( \ | ||||
|       weight,                      \ | ||||
|       indices,                     \ | ||||
|       offsets,                     \ | ||||
|       per_sample_weights,          \ | ||||
|       output,                      \ | ||||
|       offset2bag,                  \ | ||||
|       bag_size,                    \ | ||||
|       max_indices,                 \ | ||||
|       params,                      \ | ||||
|       tid) | ||||
|  | ||||
| template <typename T, typename I> | ||||
| kernel void embedding_bag( | ||||
|     constant T* weight [[buffer(0)]], | ||||
|     constant I* indices [[buffer(1)]], | ||||
|     constant I* offsets [[buffer(2)]], | ||||
|     constant T* per_sample_weights [[buffer(3)]], | ||||
|     device T* output [[buffer(4)]], | ||||
|     device I* offset2bag [[buffer(5)]], | ||||
|     device I* bag_size [[buffer(6)]], | ||||
|     device I* max_indices [[buffer(7)]], | ||||
|     constant EmbeddingBagParams<uint32_t>& params [[buffer(8)]], | ||||
|     uint tid [[thread_position_in_grid]]) { | ||||
|   switch (params.mode) { | ||||
|     case EmbeddingBagMode::SUM: | ||||
|       DISPATCH_IMPL(EmbeddingBagMode::SUM); | ||||
|     case EmbeddingBagMode::MEAN: | ||||
|       DISPATCH_IMPL(EmbeddingBagMode::MEAN); | ||||
|     case EmbeddingBagMode::MAX: | ||||
|       DISPATCH_IMPL(EmbeddingBagMode::MAX); | ||||
|   } | ||||
| } | ||||
|  | ||||
| #define REGISTER_EMBEDDING_BAG_OP(T, I)                             \ | ||||
|   template [[host_name("embedding_bag_" #T "_" #I)]]                \ | ||||
|   kernel void embedding_bag<T, I>(                                  \ | ||||
|       constant T * weight [[buffer(0)]],                            \ | ||||
|       constant I * indices [[buffer(1)]],                           \ | ||||
|       constant I * offsets [[buffer(2)]],                           \ | ||||
|       constant T * per_sample_weights [[buffer(3)]],                \ | ||||
|       device T * output [[buffer(4)]],                              \ | ||||
|       device I * offset2bag [[buffer(5)]],                          \ | ||||
|       device I * bag_size [[buffer(6)]],                            \ | ||||
|       device I * max_indices [[buffer(7)]],                         \ | ||||
|       constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \ | ||||
|       uint tid [[thread_position_in_grid]]); | ||||
|  | ||||
| REGISTER_EMBEDDING_BAG_OP(float, int); | ||||
| REGISTER_EMBEDDING_BAG_OP(float, long); | ||||
| REGISTER_EMBEDDING_BAG_OP(half, int); | ||||
| REGISTER_EMBEDDING_BAG_OP(half, long); | ||||
| REGISTER_EMBEDDING_BAG_OP(bfloat, int); | ||||
| REGISTER_EMBEDDING_BAG_OP(bfloat, long); | ||||
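The kernel above assigns one thread per (bag, feature) pair and folds SUM/MEAN/MAX into a single loop via the reduction functors. A plain Python reference of the same per-bag reduction, assuming `include_last_offset=False` so the last bag runs to the end of `indices`:

    import torch

    def embedding_bag_ref(weight, indices, offsets, mode="sum", padding_idx=-1):
        offs = offsets.tolist() + [indices.numel()]
        out = torch.zeros(len(offsets), weight.size(1))
        for i, (start, end) in enumerate(zip(offs[:-1], offs[1:])):
            idx = indices[start:end]
            idx = idx[idx != padding_idx]  # padding rows are skipped
            if idx.numel() == 0:
                continue                   # empty bags stay 0, as in ReductionOpFinal
            rows = weight[idx]
            if mode == "sum":
                out[i] = rows.sum(0)
            elif mode == "mean":
                out[i] = rows.mean(0)
            else:                          # "max"
                out[i] = rows.max(0).values
        return out

On such inputs the result should agree with `torch.nn.functional.embedding_bag(indices, weight, offsets, mode=mode)` for these modes.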
| @ -198,7 +198,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_, | ||||
|  | ||||
|     if (input_t.is_contiguous(memory_format) && output_t.is_contiguous(memory_format) && is_macOS_15_0_or_newer) { | ||||
|       inputNDArray = getMPSNDArray(input_t, inputShape); | ||||
|       outputNDArray = getMPSNDArray(output_t, outputShape); | ||||
|       outputNDArray = getMPSNDArray(*output, outputShape); | ||||
|     } | ||||
|  | ||||
|     auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) { | ||||
| @ -302,7 +302,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_, | ||||
|       } | ||||
|     } | ||||
|     auto outputPlaceholder = outputNDArray ? Placeholder(cachedGraph->outputTensor_, outputNDArray) | ||||
|                                            : Placeholder(cachedGraph->outputTensor_, output_t); | ||||
|                                            : Placeholder(cachedGraph->outputTensor_, *output); | ||||
|  | ||||
|     NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = | ||||
|         [[[NSMutableDictionary alloc] initWithCapacity:3] autorelease]; | ||||
| @ -315,7 +315,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_, | ||||
|     runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); | ||||
|   } | ||||
|  | ||||
|   return output_t; | ||||
|   return *output; | ||||
| } | ||||
|  | ||||
| Tensor _mps_convolution(const Tensor& input_t, | ||||
|  | ||||
| @ -1,180 +0,0 @@ | ||||
| #define TORCH_ASSERT_ONLY_METHOD_OPERATORS | ||||
| #include <ATen/TensorUtils.h> | ||||
| #include <ATen/core/Tensor.h> | ||||
| #include <ATen/mps/MPSProfiler.h> | ||||
| #include <ATen/native/EmbeddingBag.h> | ||||
| #include <ATen/native/Pool.h> | ||||
| #include <ATen/native/mps/OperationUtils.h> | ||||
| #include <ATen/native/mps/kernels/EmbeddingBag.h> | ||||
|  | ||||
| #include <fmt/format.h> | ||||
|  | ||||
| #ifndef AT_PER_OPERATOR_HEADERS | ||||
| #include <ATen/Functions.h> | ||||
| #include <ATen/NativeFunctions.h> | ||||
| #else | ||||
| #include <ATen/ops/_embedding_bag_forward_only_native.h> | ||||
| #include <ATen/ops/_embedding_bag_native.h> | ||||
| #include <ATen/ops/empty.h> | ||||
| #endif | ||||
|  | ||||
| namespace at::native { | ||||
|  | ||||
| #ifndef PYTORCH_JIT_COMPILE_SHADERS | ||||
| static auto& lib = mps::MetalShaderLibrary::getBundledLibrary(); | ||||
| #else | ||||
| #include <ATen/native/mps/EmbeddingBag_metallib.h> | ||||
| #endif | ||||
|  | ||||
| namespace { | ||||
|  | ||||
| std::pair<Tensor, Tensor> promoteIndicesAndOffsets(const Tensor& indices, const Tensor& offsets) { | ||||
|   const auto commonType = promoteTypes(offsets.scalar_type(), indices.scalar_type()); | ||||
|   return {indices.scalar_type() == commonType ? indices : indices.toType(commonType), | ||||
|           offsets.scalar_type() == commonType ? offsets : offsets.toType(commonType)}; | ||||
| } | ||||
|  | ||||
| } // namespace | ||||
|  | ||||
| namespace mps { | ||||
|  | ||||
| static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl( | ||||
|     const Tensor& weight, | ||||
|     const Tensor& indices_, | ||||
|     const Tensor& offsets_, | ||||
|     const bool scale_grad_by_freq, | ||||
|     const int64_t mode, | ||||
|     bool sparse, | ||||
|     const std::optional<Tensor>& per_sample_weights_opt, | ||||
|     bool include_last_offset, | ||||
|     int64_t padding_idx) { | ||||
|   TORCH_CHECK(indices_.dim() == 1, "input has to be a 1D Tensor, but got Tensor of dimension ", indices_.dim()); | ||||
|   if (indices_.dim() == 1) { | ||||
|     TORCH_CHECK(offsets_.dim() == 1, "offsets has to be a 1D Tensor, but got Tensor of dimension ", offsets_.dim()); | ||||
|   } | ||||
|   TORCH_CHECK(weight.dim() == 2, "weight has to be a 2D Tensor, but got Tensor of dimension ", weight.dim()); | ||||
|  | ||||
|   Tensor indices, offsets; | ||||
|   std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_); | ||||
|   auto indices_arg = TensorArg(indices, "indices", 1); | ||||
|   checkScalarTypes("embedding_bag_mps", indices_arg, {kLong, kInt}); | ||||
|   auto offsets_arg = TensorArg(offsets, "offsets", 1); | ||||
|   checkScalarTypes("embedding_bag_mps", offsets_arg, {kLong, kInt}); | ||||
|   checkSameType("embedding_bag_mps", indices_arg, offsets_arg); | ||||
|   auto weight_arg = TensorArg(weight, "weight", 1); | ||||
|  | ||||
|   int64_t num_indices = indices.size(0); | ||||
|   int64_t num_bags = offsets.size(0); | ||||
|   if (include_last_offset) { | ||||
|     TORCH_CHECK(num_bags >= 1, "include_last_offset: number of offsets should be at least 1"); | ||||
|     num_bags -= 1; | ||||
|   } | ||||
|   int64_t feature_size = weight.size(1); | ||||
|  | ||||
|   auto bag_size = at::empty({num_bags}, indices.options()); | ||||
|   auto offset2bag = at::empty({indices.size(0)}, indices.options()); | ||||
|   auto output = at::empty({num_bags, feature_size}, weight.options()); | ||||
|  | ||||
|   Tensor max_indices; | ||||
|  | ||||
|   if (mode == EmbeddingBagMode::MAX) { | ||||
|     max_indices = at::empty({num_bags, feature_size}, indices.options()); | ||||
|   } else { | ||||
|     max_indices = at::empty({0}, indices.options()); | ||||
|   } | ||||
|  | ||||
|   EmbeddingBagParams<uint32_t> params; | ||||
|  | ||||
|   for (const auto dim : c10::irange(weight.dim())) { | ||||
|     params.weight_strides[dim] = safe_downcast<uint32_t, int64_t>(weight.stride(dim)); | ||||
|     params.output_strides[dim] = safe_downcast<uint32_t, int64_t>(output.stride(dim)); | ||||
|  | ||||
|     if (mode == EmbeddingBagMode::MAX) { | ||||
|       params.max_indices_strides[dim] = safe_downcast<uint32_t, int64_t>(max_indices.stride(dim)); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined(); | ||||
|   params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0; | ||||
|  | ||||
|   params.num_indices = num_indices; | ||||
|   params.num_bags = num_bags; | ||||
|   params.feature_size = feature_size; | ||||
|   params.mode = static_cast<EmbeddingBagMode>(mode); | ||||
|   params.padding_idx = padding_idx; | ||||
|  | ||||
|   auto num_threads = output.numel(); | ||||
|   MPSStream* stream = getCurrentMPSStream(); | ||||
|  | ||||
|   dispatch_sync_with_rethrow(stream->queue(), ^() { | ||||
|     @autoreleasepool { | ||||
|       id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder(); | ||||
|       auto pipeline_state = lib.getPipelineStateForFunc( | ||||
|           fmt::format("embedding_bag_{}_{}", scalarToMetalTypeString(weight), scalarToMetalTypeString(indices))); | ||||
|  | ||||
|       getMPSProfiler().beginProfileKernel(pipeline_state, "embedding_bag", {weight, indices, offsets}); | ||||
|       [computeEncoder setComputePipelineState:pipeline_state]; | ||||
|       mtl_setArgs(computeEncoder, | ||||
|                   weight, | ||||
|                   indices, | ||||
|                   offsets, | ||||
|                   use_per_sample_weights ? per_sample_weights_opt : std::nullopt, | ||||
|                   output, | ||||
|                   offset2bag, | ||||
|                   bag_size, | ||||
|                   max_indices, | ||||
|                   params); | ||||
|  | ||||
|       mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads); | ||||
|       getMPSProfiler().endProfileKernel(pipeline_state); | ||||
|     } | ||||
|   }); | ||||
|  | ||||
|   return std::tuple<Tensor, Tensor, Tensor, Tensor>( | ||||
|       std::move(output), std::move(offset2bag), std::move(bag_size), std::move(max_indices)); | ||||
| } | ||||
|  | ||||
| } // namespace mps | ||||
|  | ||||
| std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps(const Tensor& weight, | ||||
|                                                               const Tensor& indices, | ||||
|                                                               const Tensor& offsets, | ||||
|                                                               const bool scale_grad_by_freq, | ||||
|                                                               const int64_t mode, | ||||
|                                                               bool sparse, | ||||
|                                                               const std::optional<Tensor>& per_sample_weights_opt, | ||||
|                                                               bool include_last_offset, | ||||
|                                                               int64_t padding_idx) { | ||||
|   return mps::_embedding_bag_mps_impl(weight, | ||||
|                                       indices, | ||||
|                                       offsets, | ||||
|                                       scale_grad_by_freq, | ||||
|                                       mode, | ||||
|                                       sparse, | ||||
|                                       per_sample_weights_opt, | ||||
|                                       include_last_offset, | ||||
|                                       padding_idx); | ||||
| } | ||||
|  | ||||
| std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_forward_only_mps( | ||||
|     const Tensor& weight, | ||||
|     const Tensor& indices, | ||||
|     const Tensor& offsets, | ||||
|     const bool scale_grad_by_freq, | ||||
|     const int64_t mode, | ||||
|     bool sparse, | ||||
|     const std::optional<Tensor>& per_sample_weights_opt, | ||||
|     bool include_last_offset, | ||||
|     int64_t padding_idx) { | ||||
|   return _embedding_bag_mps(weight, | ||||
|                             indices, | ||||
|                             offsets, | ||||
|                             scale_grad_by_freq, | ||||
|                             mode, | ||||
|                             sparse, | ||||
|                             per_sample_weights_opt, | ||||
|                             include_last_offset, | ||||
|                             padding_idx); | ||||
| } | ||||
|  | ||||
| } // namespace at::native | ||||
| @ -20,7 +20,6 @@ | ||||
| #include <ATen/ops/baddbmm_native.h> | ||||
| #include <ATen/ops/bmm_native.h> | ||||
| #include <ATen/ops/cholesky_native.h> | ||||
| #include <ATen/ops/eye_native.h> | ||||
| #include <ATen/ops/linalg_cholesky_ex_native.h> | ||||
| #include <ATen/ops/linalg_inv_ex_native.h> | ||||
| #include <ATen/ops/linalg_lu_factor_ex_native.h> | ||||
| @ -497,24 +496,26 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const | ||||
|   using namespace mps; | ||||
|   TORCH_CHECK(result.is_mps(), "Output tensor is not MPS"); | ||||
|   TORCH_CHECK(!A.is_complex(), "linalg_inv: not supported for complex types yet!"); | ||||
|   using CachedGraph = MPSUnaryCachedGraph; | ||||
|  | ||||
|   MPSStream* stream = getCurrentMPSStream(); | ||||
|   info.zero_(); | ||||
|  | ||||
|   if (A.numel() == 0) { | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   if (!result.is_contiguous()) { | ||||
|     result.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous); | ||||
|   } | ||||
|   auto A_sizes = A.sizes(); | ||||
|   int ndim = A.dim(); | ||||
|  | ||||
|   Tensor LU = empty_like(A, MemoryFormat::Contiguous); | ||||
|   Tensor identity = eye(A.size(-2), A.size(-1), A.scalar_type(), A.options().layout(), A.device()).expand_as(A); | ||||
|   Tensor LU = empty_like(A); | ||||
|   Tensor identity = zeros_like(A); | ||||
|   Tensor pivots = empty({A_sizes.begin(), A_sizes.end() - 1}, A.options().dtype(kInt)); | ||||
|   // need to do this to keep the strides of the result tensor | ||||
|   // mps's solve expects row major layout, while inductor | ||||
|   // expects result to be column major | ||||
|   Tensor tmp = empty_like(A, MemoryFormat::Contiguous); | ||||
|   linalg_solve_out_mps_impl(A, identity, true, check_errors, tmp, LU, pivots, info); | ||||
|   result.copy_(tmp); | ||||
|   (ndim == 2 ? identity.diagonal() : identity.diagonal(0, -2, -1)).fill_(1); | ||||
|   linalg_solve_out_mps_impl(A, identity, true, check_errors, result, LU, pivots, info); | ||||
| } | ||||
|  | ||||
| static Tensor& mm_out_mps_impl(const Tensor& self, const Tensor& other, Tensor& output) { | ||||
|  | ||||
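The rewrite above computes the inverse by solving `A X = I` with an explicit identity right-hand side (built via `eye(...).expand_as(A)`). The numerical idea, sketched on CPU:

    import torch

    A = torch.randn(4, 4, dtype=torch.float64)
    A = A @ A.T + 4 * torch.eye(4, dtype=torch.float64)  # well-conditioned
    X = torch.linalg.solve(A, torch.eye(4, dtype=torch.float64))
    torch.testing.assert_close(X, torch.linalg.inv(A))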
| @ -519,13 +519,6 @@ static void max_unpool_out_mps_template(const Tensor& input, | ||||
|                                         Tensor& output, | ||||
|                                         const int32_t pooling_dims, | ||||
|                                         const std::string& op_name) { | ||||
|   TORCH_CHECK(output_size_.size() == static_cast<size_t>(pooling_dims), | ||||
|               op_name, | ||||
|               "There should be exactly ", | ||||
|               pooling_dims, | ||||
|               " elements but got ", | ||||
|               output_size_.size()); | ||||
|  | ||||
|   auto dims = input.dim(); | ||||
|   auto leading_dims = input.dim() - pooling_dims; | ||||
|  | ||||
| @ -541,18 +534,6 @@ static void max_unpool_out_mps_template(const Tensor& input, | ||||
|   output.resize_(output_size, memory_format); | ||||
|   output.fill_(0); | ||||
|  | ||||
|   if (indices.defined() && indices.numel() > 0) { | ||||
|     auto output_image_size = c10::multiply_integers(output_size_); | ||||
|  | ||||
|     int64_t min_idx = indices.min().item<int64_t>(); | ||||
|     int64_t max_idx = indices.max().item<int64_t>(); | ||||
|  | ||||
|     if (min_idx < 0 || max_idx >= output_image_size) { | ||||
|       int64_t error_idx = (min_idx < 0) ? min_idx : max_idx; | ||||
|       TORCH_CHECK(false, "Found an invalid max index: ", error_idx, " for output tensor of shape ", output_size_); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   id<MTLDevice> device = MPSDevice::getInstance()->device(); | ||||
|   MPSStream* mpsStream = getCurrentMPSStream(); | ||||
|   const auto numThreads = input.numel(); | ||||
|  | ||||
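The deleted checks validated `output_size_` and the unpooling indices up front; indices produced by a matching `max_pool` call satisfy both by construction, as in this sketch:

    import torch
    import torch.nn.functional as F

    x = torch.randn(1, 1, 4, 4)
    pooled, indices = F.max_pool2d(x, kernel_size=2, return_indices=True)
    unpooled = F.max_unpool2d(pooled, indices, kernel_size=2)
    assert unpooled.shape == x.shape  # maxima restored; all other entries are 0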
| @ -2351,7 +2351,6 @@ | ||||
|   dispatch: | ||||
|     CPU: _embedding_bag_forward_only_cpu | ||||
|     CUDA: _embedding_bag_forward_only_cuda | ||||
|     MPS: _embedding_bag_forward_only_mps | ||||
|   autogen: _embedding_bag_forward_only.out | ||||
|  | ||||
| - func: _rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor) | ||||
| @ -2373,7 +2372,6 @@ | ||||
|   dispatch: | ||||
|     CPU: _embedding_bag_cpu | ||||
|     CUDA: _embedding_bag_cuda | ||||
|     MPS: _embedding_bag_mps | ||||
|   autogen: _embedding_bag.out | ||||
|   tags: core | ||||
|  | ||||
| @ -3858,7 +3856,7 @@ | ||||
|   device_check: NoCheck   # TensorIterator | ||||
|   structured: True | ||||
|   dispatch: | ||||
|     CPU, CUDA, MTIA: aminmax_out | ||||
|     CPU, CUDA: aminmax_out | ||||
|     MPS: aminmax_out_mps | ||||
|  | ||||
| - func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor | ||||
| @ -3909,7 +3907,7 @@ | ||||
| - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   structured: True | ||||
|   dispatch: | ||||
|     CPU, CUDA, MTIA: amax_out | ||||
|     CPU, CUDA: amax_out | ||||
|     MPS: amax_out_mps | ||||
|  | ||||
| # Return: (Tensor output, Tensor indices) | ||||
| @ -4090,7 +4088,7 @@ | ||||
| - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   structured: True | ||||
|   dispatch: | ||||
|     CPU, CUDA, MTIA: amin_out | ||||
|     CPU, CUDA: amin_out | ||||
|     MPS: amin_out_mps | ||||
|  | ||||
| # TODO: Add this function to MPS dispatch key so that we avoid declaring it in | ||||
| @ -4243,7 +4241,6 @@ | ||||
|     CPU: _weight_int8pack_mm_cpu | ||||
|     CUDA: _weight_int8pack_mm_cuda | ||||
|     MPS: _weight_int8pack_mm_mps | ||||
|     XPU: _weight_int8pack_mm_xpu | ||||
|  | ||||
| - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor | ||||
|   python_module: sparse | ||||
| @ -4375,7 +4372,7 @@ | ||||
|   variants: function, method | ||||
|   dispatch: | ||||
|     CPU: narrow_copy_dense_cpu | ||||
|     SparseCPU, SparseCUDA, SparseMPS: narrow_copy_sparse | ||||
|     SparseCPU, SparseCUDA: narrow_copy_sparse | ||||
|     CompositeExplicitAutogradNonFunctional: narrow_copy_dense_symint | ||||
|   tags: view_copy | ||||
|  | ||||
| @ -6663,7 +6660,7 @@ | ||||
| - func: zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!) | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: zeros_out | ||||
|     SparseCPU, SparseCUDA, SparseMPS, SparseMeta: zeros_sparse_out | ||||
|     SparseCPU, SparseCUDA, SparseMeta: zeros_sparse_out | ||||
|  | ||||
| - func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor | ||||
|   dispatch: | ||||
| @ -10702,7 +10699,6 @@ | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow | ||||
|     CUDA: foreach_tensor_div_list_kernel_cuda | ||||
|     MTIA: foreach_tensor_div_list_kernel_mtia | ||||
|  | ||||
| - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () | ||||
|   device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices | ||||
| @ -10710,7 +10706,6 @@ | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_ | ||||
|     CUDA: foreach_tensor_div_list_kernel_cuda_ | ||||
|     MTIA: foreach_tensor_div_list_kernel_mtia_ | ||||
|   autogen: _foreach_div.List_out | ||||
|  | ||||
| - func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] | ||||
| @ -10734,7 +10729,6 @@ | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow | ||||
|     CUDA: foreach_tensor_div_tensor_kernel_cuda | ||||
|     MTIA: foreach_tensor_div_tensor_kernel_mtia | ||||
|  | ||||
| - func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> () | ||||
|   device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices | ||||
| @ -10742,7 +10736,6 @@ | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_ | ||||
|     CUDA: foreach_tensor_div_tensor_kernel_cuda_ | ||||
|     MTIA: foreach_tensor_div_tensor_kernel_mtia_ | ||||
|   autogen: _foreach_div.Tensor_out | ||||
|  | ||||
| - func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] | ||||
| @ -10849,7 +10842,6 @@ | ||||
|   dispatch: | ||||
|     CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_ | ||||
|     CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_ | ||||
|     MTIA: foreach_tensor_maximum_scalar_kernel_mtia_ | ||||
|   autogen: _foreach_maximum.Scalar_out | ||||
|  | ||||
| # foreach_minimum/maximum dispatches to clamp_max/min | ||||
|  | ||||
| @ -1,6 +1,5 @@ | ||||
| #define TORCH_ASSERT_ONLY_METHOD_OPERATORS | ||||
| #include <ATen/core/Tensor.h> | ||||
| #include <ATen/Dispatch.h> | ||||
| #include <ATen/ceil_div.h> | ||||
| #include <ATen/native/cuda/Loops.cuh> | ||||
| #include <c10/cuda/CUDAGuard.h> | ||||
| @ -22,11 +21,10 @@ | ||||
| namespace at::native { | ||||
|  | ||||
| namespace { | ||||
| template <typename T> | ||||
| __global__ void ChooseQuantizationParamsKernelImpl( | ||||
|     const int64_t* fake_quant_on, | ||||
|     const T* x_min, | ||||
|     const T* x_max, | ||||
|     const float* x_min, | ||||
|     const float* x_max, | ||||
|     int32_t qmin, | ||||
|     int32_t qmax, | ||||
|     int size, | ||||
| @ -95,44 +93,34 @@ __global__ void ChooseQuantizationParamsKernelImpl( | ||||
|   } | ||||
| } | ||||
|  | ||||
| __device__ inline bool isinf_device(float v) { | ||||
|   return ::isinf(v); | ||||
| } | ||||
| __device__ inline bool isinf_device(c10::BFloat16 v) { | ||||
|   return ::isinf(static_cast<float>(v)); | ||||
| } | ||||
|  | ||||
| // CUDA kernel to compute Moving Average Min/Max of the tensor. | ||||
| // It uses the running_min and running_max along with averaging const, c. | ||||
| // The formula used to compute the new min/max is as follows | ||||
| // | ||||
| // running_min = (1 - c) * running_min + c * x_min, if running_min != inf | ||||
| // running_min = x_min, if running_min == inf | ||||
| template <typename T> | ||||
| __global__ void MovingAverageMinMax( | ||||
|     const int64_t* observer_on, | ||||
|     const T* x_min, | ||||
|     const T* x_max, | ||||
|     T* running_min, | ||||
|     T* running_max, | ||||
|     const float* x_min, | ||||
|     const float* x_max, | ||||
|     float* running_min, | ||||
|     float* running_max, | ||||
|     const float averaging_const, | ||||
|     const int size) { | ||||
|   int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|  | ||||
|   if (*observer_on == 1) { | ||||
|     if (i < size) { | ||||
|       T curr_min = x_min[i]; | ||||
|       T curr_max = x_max[i]; | ||||
|       float curr_min = x_min[i]; | ||||
|       float curr_max = x_max[i]; | ||||
|  | ||||
|       T averaging_const_t = static_cast<T>(averaging_const); | ||||
|       float adjusted_min = ::isinf(running_min[i]) | ||||
|           ? curr_min | ||||
|           : (running_min[i]) + averaging_const * (curr_min - (running_min[i])); | ||||
|  | ||||
|       T adjusted_min = isinf_device(running_min[i]) ? curr_min | ||||
|                                                     : (running_min[i]) + | ||||
|               averaging_const_t * (curr_min - (running_min[i])); | ||||
|  | ||||
|       T adjusted_max = isinf_device(running_max[i]) ? curr_max | ||||
|                                                     : (running_max[i]) + | ||||
|               averaging_const_t * (curr_max - (running_max[i])); | ||||
|       float adjusted_max = ::isinf(running_max[i]) | ||||
|           ? curr_max | ||||
|           : (running_max[i]) + averaging_const * (curr_max - (running_max[i])); | ||||
|  | ||||
|       running_min[i] = adjusted_min; | ||||
|       running_max[i] = adjusted_max; | ||||
| @ -154,51 +142,40 @@ void _calculate_moving_average( | ||||
|   at::Tensor x_min, x_max; | ||||
|  | ||||
|   int64_t* observer_on_data = observer_on.data_ptr<int64_t>(); | ||||
|   float* running_min_data = running_min.data_ptr<float>(); | ||||
|   float* running_max_data = running_max.data_ptr<float>(); | ||||
|   cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream(); | ||||
|  | ||||
|   if (per_row_fq) { | ||||
|     std::tie(x_min, x_max) = at::aminmax(x, 1); | ||||
|     float* x_min_data = x_min.data_ptr<float>(); | ||||
|     float* x_max_data = x_max.data_ptr<float>(); | ||||
|     int num_threads = std::min(size, (int64_t)512); | ||||
|     const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads); | ||||
|     AT_DISPATCH_FLOATING_TYPES_AND( | ||||
|         at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] { | ||||
|           scalar_t* x_min_data = x_min.data_ptr<scalar_t>(); | ||||
|           scalar_t* x_max_data = x_max.data_ptr<scalar_t>(); | ||||
|  | ||||
|           scalar_t* running_min_data = running_min.data_ptr<scalar_t>(); | ||||
|           scalar_t* running_max_data = running_max.data_ptr<scalar_t>(); | ||||
|  | ||||
|           // Moving Average Min/Max observer for activations | ||||
|           MovingAverageMinMax<<<num_blocks, num_threads, 0, cuda_stream>>>( | ||||
|               observer_on_data, | ||||
|               x_min_data, | ||||
|               x_max_data, | ||||
|               running_min_data, | ||||
|               running_max_data, | ||||
|               averaging_const, | ||||
|               size); | ||||
|         }); | ||||
|     // Moving Average Min/Max observer for activations | ||||
|     MovingAverageMinMax<<<num_blocks, num_threads, 0, cuda_stream>>>( | ||||
|         observer_on_data, | ||||
|         x_min_data, | ||||
|         x_max_data, | ||||
|         running_min_data, | ||||
|         running_max_data, | ||||
|         averaging_const, | ||||
|         size); | ||||
|     C10_CUDA_KERNEL_LAUNCH_CHECK(); | ||||
|   } else { | ||||
|     std::tie(x_min, x_max) = at::aminmax(x); | ||||
|     AT_DISPATCH_FLOATING_TYPES_AND( | ||||
|         at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] { | ||||
|           scalar_t* x_min_data = x_min.data_ptr<scalar_t>(); | ||||
|           scalar_t* x_max_data = x_max.data_ptr<scalar_t>(); | ||||
|  | ||||
|           scalar_t* running_min_data = running_min.data_ptr<scalar_t>(); | ||||
|           scalar_t* running_max_data = running_max.data_ptr<scalar_t>(); | ||||
|  | ||||
|           // Moving Average Min/Max observer for activations | ||||
|           MovingAverageMinMax<<<1, 1, 0, cuda_stream>>>( | ||||
|               observer_on_data, | ||||
|               x_min_data, | ||||
|               x_max_data, | ||||
|               running_min_data, | ||||
|               running_max_data, | ||||
|               averaging_const, | ||||
|               1 /*size*/); | ||||
|         }); | ||||
|     float* x_min_data = x_min.data_ptr<float>(); | ||||
|     float* x_max_data = x_max.data_ptr<float>(); | ||||
|     // Moving Average Min/Max observer for activations | ||||
|     MovingAverageMinMax<<<1, 1, 0, cuda_stream>>>( | ||||
|         observer_on_data, | ||||
|         x_min_data, | ||||
|         x_max_data, | ||||
|         running_min_data, | ||||
|         running_max_data, | ||||
|         averaging_const, | ||||
|         1 /*size*/); | ||||
|     C10_CUDA_KERNEL_LAUNCH_CHECK(); | ||||
|   } | ||||
| } | ||||
| @ -221,44 +198,34 @@ void _calc_moving_avg_qparams_helper( | ||||
|   cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream(); | ||||
|   int64_t* fake_quant_on_data = fake_quant_on.data_ptr<int64_t>(); | ||||
|   if (per_row_fq) { | ||||
|     AT_DISPATCH_FLOATING_TYPES_AND( | ||||
|         at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] { | ||||
|           scalar_t* running_min_data = running_min.data_ptr<scalar_t>(); | ||||
|           scalar_t* running_max_data = running_max.data_ptr<scalar_t>(); | ||||
|           int num_threads = std::min(size, (int64_t)512); | ||||
|           const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads); | ||||
|           ChooseQuantizationParamsKernelImpl<<< | ||||
|               num_blocks, | ||||
|               num_threads, | ||||
|               0, | ||||
|               cuda_stream>>>( | ||||
|               fake_quant_on_data, | ||||
|               running_min_data, | ||||
|               running_max_data, | ||||
|               qmin, | ||||
|               qmax, | ||||
|               size, | ||||
|               symmetric_quant, | ||||
|               scale_ptr, | ||||
|               zp_ptr); | ||||
|         }); | ||||
|     float* running_min_data = running_min.data_ptr<float>(); | ||||
|     float* running_max_data = running_max.data_ptr<float>(); | ||||
|     int num_threads = std::min(size, (int64_t)512); | ||||
|     const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads); | ||||
|     ChooseQuantizationParamsKernelImpl<<<num_blocks, num_threads, 0, cuda_stream>>>( | ||||
|         fake_quant_on_data, | ||||
|         running_min_data, | ||||
|         running_max_data, | ||||
|         qmin, | ||||
|         qmax, | ||||
|         size, | ||||
|         symmetric_quant, | ||||
|         scale_ptr, | ||||
|         zp_ptr); | ||||
|     C10_CUDA_KERNEL_LAUNCH_CHECK(); | ||||
|   } else { | ||||
|     AT_DISPATCH_FLOATING_TYPES_AND( | ||||
|         at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] { | ||||
|           scalar_t* running_min_data = running_min.data_ptr<scalar_t>(); | ||||
|           scalar_t* running_max_data = running_max.data_ptr<scalar_t>(); | ||||
|           ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>( | ||||
|               fake_quant_on_data, | ||||
|               running_min_data, | ||||
|               running_max_data, | ||||
|               qmin, | ||||
|               qmax, | ||||
|               1, // size | ||||
|               symmetric_quant, // preserve_sparsity | ||||
|               scale_ptr, | ||||
|               zp_ptr); | ||||
|         }); | ||||
|     float* running_min_data = running_min.data_ptr<float>(); | ||||
|     float* running_max_data = running_max.data_ptr<float>(); | ||||
|     ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>( | ||||
|         fake_quant_on_data, | ||||
|         running_min_data, | ||||
|         running_max_data, | ||||
|         qmin, | ||||
|         qmax, | ||||
|         1, // size | ||||
|         symmetric_quant, // preserve_sparsity | ||||
|         scale_ptr, | ||||
|         zp_ptr); | ||||
|     C10_CUDA_KERNEL_LAUNCH_CHECK(); | ||||
|   } | ||||
| } | ||||
|  | ||||
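The formula in the comment above, `running_min = (1 - c) * running_min + c * x_min` with an infinity-seeded first step, in scalar form:

    import math

    def update_running(running: float, current: float, c: float = 0.01) -> float:
        # the first observation seeds the state; later ones blend in with weight c
        return current if math.isinf(running) else running + c * (current - running)

    running_min = float("inf")
    for x_min in (-1.0, -0.5, -2.0):
        running_min = update_running(running_min, x_min)
    # running_min is approximately -1.005 after the three updates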
| @ -64,6 +64,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) { | ||||
|   // create sparse descriptor, dtype | ||||
|   cusparseLtMatDescriptor_t sparse_input_descriptor; | ||||
|   cudaDataType type; | ||||
|   auto compression_factor = 9; | ||||
|  | ||||
|   #ifdef USE_ROCM | ||||
|   TORCH_CHECK(isHipSparseLtSupported()); | ||||
| @ -72,6 +73,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) { | ||||
|   switch (sparse_input.scalar_type()) { | ||||
|     case at::ScalarType::Char: | ||||
|       type = CUDA_R_8I; | ||||
|       compression_factor = 10; | ||||
|       break; | ||||
|     case at::ScalarType::Half: | ||||
|       type = CUDA_R_16F; | ||||
| @ -87,6 +89,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) { | ||||
| #if defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602 && !defined(USE_ROCM) | ||||
|     case at::ScalarType::Float8_e4m3fn: | ||||
|       type = CUDA_R_8F_E4M3; | ||||
|       compression_factor = 10; | ||||
|       break; | ||||
| #endif | ||||
|     default: | ||||
| @ -94,6 +97,10 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) { | ||||
|       break; | ||||
|   } | ||||
|  | ||||
|   // create a new compressed tensor with the same dtype as the input | ||||
|   auto compressed_tensor = | ||||
|       sparse_input.new_empty(sparse_input.numel() * compression_factor / 16); | ||||
|  | ||||
|   TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit( | ||||
|       &handle, | ||||
|       &sparse_input_descriptor, | ||||
| @ -114,15 +121,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) { | ||||
|       &compressed_size, | ||||
|       &compressed_buffer_size)); | ||||
|  | ||||
|   // create a new compressed tensor with the same dtype as the input, | ||||
|   // and with packed data/metadata stored in an array with original | ||||
|   // number of rows, and sufficient columns to provide compressed_size | ||||
|   // buffer (in bytes) | ||||
|   size_t orig_m = sparse_input.size(0); | ||||
|   size_t div = orig_m * sparse_input.itemsize(); | ||||
|   size_t new_n = (compressed_size + div - 1) / div; // floor | ||||
|   auto compressed_tensor = sparse_input.new_empty({(int64_t)orig_m, (int64_t)new_n}); | ||||
|  | ||||
|   auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); | ||||
|   auto compressedBufferPtr = allocator.allocate(compressed_buffer_size); | ||||
|   cudaStream_t stream = at::cuda::getCurrentCUDAStream(); | ||||
| @ -167,6 +165,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl( | ||||
|   cudaDataType output_type; | ||||
|   cudaDataType C_type; | ||||
|   cusparseComputeType compute_type; | ||||
|   auto compression_factor = 9; | ||||
|  | ||||
|   #ifdef USE_ROCM | ||||
|   TORCH_CHECK(isHipSparseLtSupported()); | ||||
| @ -178,6 +177,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl( | ||||
|       output_type = CUDA_R_8I; | ||||
|       C_type = CUDA_R_8I; | ||||
|       compute_type = CUSPARSE_COMPUTE_32I; | ||||
|       compression_factor = 10; | ||||
|       break; | ||||
|  | ||||
| // cuSPARSELt v0.5.2 onwards changes CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUTE_16F | ||||
| @ -210,6 +210,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl( | ||||
|       output_type = CUDA_R_8F_E4M3; | ||||
|       C_type = CUDA_R_16F; | ||||
|       compute_type = CUSPARSE_COMPUTE_32F; | ||||
|       compression_factor = 10; | ||||
|       break; | ||||
| #endif | ||||
| // cuSPARSELt <= v0.5.2 uses CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUTE_16F | ||||
| @ -299,10 +300,9 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl( | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   TORCH_INTERNAL_ASSERT(compressed_A.dim() == 2); // encoded M x S | ||||
|   int64_t k = dense_B.size(0); | ||||
|   int64_t n = dense_B.size(1); | ||||
|   int64_t m = compressed_A.size(0); | ||||
|   int64_t m = (compressed_A.numel() * 16 / compression_factor) / k; | ||||
|  | ||||
|   // initialize sparse descriptor | ||||
|   cusparseLtMatDescriptor_t sparse_input_descriptor; | ||||
|  | ||||
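The `compression_factor` introduced above sizes the packed buffer in elements of the input dtype: 2:4 sparsity keeps half the values, and the remaining fraction covers metadata, with 8-bit dtypes needing a relatively larger share. The sizing arithmetic:

    def cslt_compressed_numel(numel: int, is_8bit_dtype: bool) -> int:
        # mirrors sparse_input.numel() * compression_factor / 16 in _cslt_compress
        factor = 10 if is_8bit_dtype else 9
        return numel * factor // 16

    assert cslt_compressed_numel(1024, is_8bit_dtype=False) == 576  # 9/16
    assert cslt_compressed_numel(1024, is_8bit_dtype=True) == 640   # 10/16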
| @ -5,6 +5,51 @@ | ||||
|  | ||||
| #include <ATen/test/allocator_clone_test.h> | ||||
|  | ||||
| #include <torch/csrc/cuda/CUDAPluggableAllocator.h> | ||||
|  | ||||
| TEST(AllocatorTestCUDA, test_clone) { | ||||
|   test_allocator_clone(c10::cuda::CUDACachingAllocator::get()); | ||||
| } | ||||
|  | ||||
| static int called_dummy_free_0 = 0; | ||||
| static int called_dummy_free_1 = 0; | ||||
|  | ||||
| void* dummy_alloc_0(size_t size, int device, void* stream) {return nullptr;} | ||||
| void dummy_free_0(void* data, size_t size, int device, void* stream) { | ||||
|   called_dummy_free_0++; | ||||
| } | ||||
| void dummy_free_1(void* data, size_t size, int device, void* stream) { | ||||
|   called_dummy_free_1++; | ||||
| } | ||||
|  | ||||
| // Tests that data_ptrs have their respective deleters | ||||
| // when mixing allocators | ||||
| TEST(AllocatorTestCUDA, test_pluggable_allocator_deleters) { | ||||
|   // Create a tensor with dummy_allocator_0, where dummy_free_0 is the deleter | ||||
|   auto dummy_allocator_0 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_0); | ||||
|   c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_0.get()); | ||||
|   at::Tensor a = at::empty({0}, at::TensorOptions().device(at::kCUDA)); | ||||
|  | ||||
|   // Create a tensor with dummy_allocator_1, where dummy_free_1 is the deleter | ||||
|   auto dummy_allocator_1 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_1); | ||||
|   c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_1.get()); | ||||
|   at::Tensor b = at::empty({0}, at::TensorOptions().device(at::kCUDA)); | ||||
|  | ||||
|   // Manually use a's deleter | ||||
|   auto* ctx = a.storage().data_ptr().get_context(); | ||||
|   a.storage().data_ptr().get_deleter()(ctx); | ||||
|   a.storage().mutable_data_ptr().release_context(); | ||||
|  | ||||
|   // a's deleter is dummy_free_0 | ||||
|   // dummy_free_0 should be called above, so called_dummy_free_0 should be 1 | ||||
|   ASSERT_TRUE(called_dummy_free_0 == 1); | ||||
|  | ||||
|   // Manually use b's deleter | ||||
|   ctx = b.storage().data_ptr().get_context(); | ||||
|   b.storage().data_ptr().get_deleter()(ctx); | ||||
|   b.storage().mutable_data_ptr().release_context(); | ||||
|  | ||||
|   // b's deleter is dummy_free_1 | ||||
|   // dummy_free_1 should be called above, so called_dummy_free_1 should be 1 | ||||
|   ASSERT_TRUE(called_dummy_free_1 == 1); | ||||
| } | ||||
|  | ||||
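The same pluggable-allocator mechanism this C++ test exercises is reachable from Python. A hedged sketch; the shared-library path and symbol names are hypothetical, and the library must export C functions with the CUDAPluggableAllocator malloc/free signatures:

    import torch

    # my_alloc.so, my_malloc, and my_free are placeholders for a user-built library
    alloc = torch.cuda.memory.CUDAPluggableAllocator(
        "./my_alloc.so", "my_malloc", "my_free")
    torch.cuda.memory.change_current_allocator(alloc)
    t = torch.empty(1024, device="cuda")  # now allocated through my_malloc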
| @ -42,7 +42,7 @@ TEST(MPSObjCInterfaceTest, MPSCustomKernel) { | ||||
|     id<MTLLibrary> customKernelLibrary = [device newLibraryWithSource: [NSString stringWithUTF8String:CUSTOM_KERNEL] | ||||
|                                                               options: nil | ||||
|                                                                 error: &error]; | ||||
|     TORCH_CHECK(customKernelLibrary, "Failed to create custom kernel library, error: ", error.localizedDescription.UTF8String); | ||||
|     TORCH_CHECK(customKernelLibrary, "Failed to to create custom kernel library, error: ", error.localizedDescription.UTF8String); | ||||
|  | ||||
|     id<MTLFunction> customFunction = [customKernelLibrary newFunctionWithName: @"add_arrays"]; | ||||
|     TORCH_CHECK(customFunction, "Failed to create function state object for the kernel"); | ||||
|  | ||||
| @ -76,23 +76,4 @@ int32_t getGlobalIdxFromDevice(DeviceIndex device) { | ||||
|   return device_global_idxs[device]; | ||||
| } | ||||
|  | ||||
| // Check if a device can access the memory of a peer device directly. | ||||
| bool canDeviceAccessPeer(DeviceIndex device, DeviceIndex peer) { | ||||
|   if (device == -1) { | ||||
|     device = c10::xpu::current_device(); | ||||
|   } | ||||
|   if (peer == -1) { | ||||
|     peer = c10::xpu::current_device(); | ||||
|   } | ||||
|   check_device_index(device); | ||||
|   check_device_index(peer); | ||||
|   // A device can always access itself | ||||
|   if (device == peer) { | ||||
|     return true; | ||||
|   } | ||||
|   return c10::xpu::get_raw_device(device).ext_oneapi_can_access_peer( | ||||
|       c10::xpu::get_raw_device(peer), | ||||
|       sycl::ext::oneapi::peer_access::access_supported); | ||||
| } | ||||
|  | ||||
| } // namespace at::xpu | ||||
|  | ||||
| @ -17,6 +17,4 @@ TORCH_XPU_API DeviceProp* getDeviceProperties(DeviceIndex device); | ||||
|  | ||||
| TORCH_XPU_API int32_t getGlobalIdxFromDevice(DeviceIndex device); | ||||
|  | ||||
| TORCH_XPU_API bool canDeviceAccessPeer(DeviceIndex device, DeviceIndex peer); | ||||
|  | ||||
| } // namespace at::xpu | ||||
|  | ||||
| @ -174,11 +174,11 @@ YituTechConvBert,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| meta-llama/Llama-3.2-1B,pass,0 | ||||
| meta-llama/Llama-3.2-1B,pass,5 | ||||
|  | ||||
|  | ||||
|  | ||||
| google/gemma-2-2b,pass,0 | ||||
| google/gemma-2-2b,pass,5 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| openai/whisper-tiny,pass,0 | ||||
| openai/whisper-tiny,pass,6 | ||||
|  | ||||
|  | ||||
|  | ||||
| Qwen/Qwen3-0.6B,pass,0 | ||||
| Qwen/Qwen3-0.6B,pass,5 | ||||
|  | ||||
| @ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_Reformer,pass,5 | ||||
| hf_Reformer,pass,8 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_T5_generate,pass,7 | ||||
| hf_T5_generate,pass,11 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_Reformer,pass,20 | ||||
| hf_Reformer,pass,25 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -170,15 +170,15 @@ YituTechConvBert,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| meta-llama/Llama-3.2-1B,fail_to_run,0 | ||||
| meta-llama/Llama-3.2-1B,fail_accuracy,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| google/gemma-2-2b,fail_to_run,0 | ||||
| google/gemma-2-2b,fail_accuracy,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| google/gemma-3-4b-it,fail_to_run,0 | ||||
| google/gemma-3-4b-it,fail_accuracy,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -186,4 +186,4 @@ openai/whisper-tiny,fail_to_run,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| Qwen/Qwen3-0.6B,fail_to_run,0 | ||||
| Qwen/Qwen3-0.6B,fail_accuracy,0 | ||||
|  | ||||
| @ -138,7 +138,7 @@ hf_Bert_large,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_BigBird,pass,27 | ||||
| hf_BigBird,pass,25 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -158,7 +158,7 @@ hf_Longformer,pass,4 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_Reformer,pass,5 | ||||
| hf_Reformer,pass,8 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -138,7 +138,7 @@ hf_Bert_large,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_BigBird,pass,27 | ||||
| hf_BigBird,pass,25 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -158,7 +158,7 @@ hf_Longformer,pass,4 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_Reformer,pass,5 | ||||
| hf_Reformer,pass,8 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -138,7 +138,7 @@ hf_Bert_large,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_BigBird,pass,27 | ||||
| hf_BigBird,pass,25 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -158,7 +158,7 @@ hf_Longformer,pass,4 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_Reformer,pass,5 | ||||
| hf_Reformer,pass,8 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -174,11 +174,11 @@ YituTechConvBert,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| meta-llama/Llama-3.2-1B,pass,0 | ||||
| meta-llama/Llama-3.2-1B,pass,5 | ||||
|  | ||||
|  | ||||
|  | ||||
| google/gemma-2-2b,pass,0 | ||||
| google/gemma-2-2b,pass,5 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| openai/whisper-tiny,pass,0 | ||||
| openai/whisper-tiny,pass,6 | ||||
|  | ||||
|  | ||||
|  | ||||
| Qwen/Qwen3-0.6B,pass,0 | ||||
| Qwen/Qwen3-0.6B,pass,5 | ||||
|  | ||||
| @ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_Reformer,pass,5 | ||||
| hf_Reformer,pass,8 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_T5_generate,pass,7 | ||||
| hf_T5_generate,pass,11 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_Reformer,pass,20 | ||||
| hf_Reformer,pass,25 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -122,7 +122,7 @@ hf_Bert_large,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_BigBird,pass,27 | ||||
| hf_BigBird,pass,25 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -142,7 +142,7 @@ hf_Longformer,pass,4 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_Reformer,pass,5 | ||||
| hf_Reformer,pass,8 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -174,11 +174,11 @@ YituTechConvBert,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| meta-llama/Llama-3.2-1B,pass,0 | ||||
| meta-llama/Llama-3.2-1B,pass,5 | ||||
|  | ||||
|  | ||||
|  | ||||
| google/gemma-2-2b,pass,0 | ||||
| google/gemma-2-2b,pass,5 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -186,8 +186,8 @@ google/gemma-3-4b-it,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| openai/whisper-tiny,pass,0 | ||||
| openai/whisper-tiny,pass,6 | ||||
|  | ||||
|  | ||||
|  | ||||
| Qwen/Qwen3-0.6B,pass,0 | ||||
| Qwen/Qwen3-0.6B,pass,5 | ||||
|  | ||||
| @ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_Reformer,pass,5 | ||||
| hf_Reformer,pass,8 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_T5_generate,pass,7 | ||||
| hf_T5_generate,pass,11 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| hf_Reformer,pass,20 | ||||
| hf_Reformer,pass,25 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -174,11 +174,11 @@ YituTechConvBert,pass,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| meta-llama/Llama-3.2-1B,pass,0 | ||||
| meta-llama/Llama-3.2-1B,pass,5 | ||||
|  | ||||
|  | ||||
|  | ||||
| google/gemma-2-2b,pass,0 | ||||
| google/gemma-2-2b,pass,5 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0 | ||||
|  | ||||
|  | ||||
|  | ||||
| openai/whisper-tiny,pass,0 | ||||
| openai/whisper-tiny,pass,6 | ||||
|  | ||||
|  | ||||
|  | ||||
| Qwen/Qwen3-0.6B,pass,0 | ||||
| Qwen/Qwen3-0.6B,pass,5 | ||||
|  | ||||