Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-30 03:34:56 +08:00)

Compare commits: feature/ju...disp_count (3 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | e4de72ea5d |  |
|  | d9258fb366 |  |
|  | ef78f99412 |  |
@@ -31,7 +31,8 @@ pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1

@@ -45,5 +46,6 @@ else
export USE_NVIDIA_PYPI_LIBS=1
fi

python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi

@@ -317,7 +317,7 @@ if __name__ == "__main__":
).decode()

print("Building PyTorch wheel")
build_vars = ""
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "

@@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
try:
with socket.create_connection((addr, port), timeout=timeout):
return
except (ConnectionRefusedError, TimeoutError):  # noqa: PERF203
except (ConnectionRefusedError, socket.timeout):  # noqa: PERF203
if i == attempt_cnt - 1:
raise
time.sleep(timeout)
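For context on the exception change in the hunk above: since Python 3.10, `socket.timeout` is an alias of the built-in `TimeoutError`, so the two spellings catch the same exception on current CI Python versions. A minimal sketch of the retry loop that hunk belongs to (names follow the hunk; the sleep/attempt policy shown is only illustrative):

```python
import socket
import time

def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
    # Poll until a TCP connect succeeds; re-raise after the last attempt.
    for i in range(attempt_cnt):
        try:
            with socket.create_connection((addr, port), timeout=timeout):
                return
        except (ConnectionRefusedError, TimeoutError):  # socket.timeout is the same class on 3.10+
            if i == attempt_cnt - 1:
                raise
            time.sleep(timeout)
```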
@@ -262,10 +262,13 @@ case "$tag" in
TRITON_CPU=yes
;;
pytorch-linux-jammy-linter)
PYTHON_VERSION=3.10
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
# would be to upgrade mypy to 1.0.0 with Python 3.11
PYTHON_VERSION=3.9
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter)
PYTHON_VERSION=3.10
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11)

@@ -1 +1 @@
e0dda9059d082537cee36be6c5e4fe3b18c880c0
56392aa978594cc155fa8af48cd949f5b5f1823a

@@ -1,2 +1,2 @@
transformers==4.56.0
transformers==4.54.0
soxr==0.5.0

@@ -1 +1 @@
bbb06c0334a6772b92d24bde54956e675c8c6604
5ae38bdb0dc066c5823e34dc9797afb9de42c866

@@ -42,27 +42,22 @@ install_pip_dependencies() {
# A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current
# numba and scipy version used in PyTorch CI
conda_run pip uninstall -y numba scipy
# Yaspin is needed for running CI test (get_benchmark_analysis_data.py)
pip_install yaspin==3.1.0

popd
}

setup_executorch() {
pushd executorch

export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON"
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
popd
}

if [ $# -eq 0 ]; then
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
pushd executorch
setup_executorch
popd
else
"$@"
fi
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
setup_executorch

@@ -93,9 +93,8 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
#Pinned versions:
#test that import:

mypy==1.16.0 ; platform_system != "Windows"
mypy==1.16.0
# Pin MyPy version because new errors are likely to appear with each release
# Skip on Windows as lots of type annotations are POSIX specific
#Description: linter
#Pinned versions: 1.16.0
#test that import: test_typing.py, test_type_hints.py

@@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2

# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably

@@ -41,6 +41,7 @@ def sample_vllm_test_library():
"pytest -v -s basic_correctness/test_cumem.py",
"pytest -v -s basic_correctness/test_basic_correctness.py",
"pytest -v -s basic_correctness/test_cpu_offload.py",
"VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py",
],
},
"vllm_basic_models_test": {

@@ -67,12 +68,15 @@ def sample_vllm_test_library():
"-v",
"-s",
"entrypoints/llm",
"--ignore=entrypoints/llm/test_lazy_outlines.py",
"--ignore=entrypoints/llm/test_generate.py",
"--ignore=entrypoints/llm/test_generate_multiple_loras.py",
"--ignore=entrypoints/llm/test_collective_rpc.py",
]
),
"pytest -v -s entrypoints/llm/test_generate.py",
"pytest -v -s entrypoints/offline_mode",
"pytest -v -s entrypoints/llm/test_lazy_outlines.py",
"pytest -v -s entrypoints/llm/test_generate.py ",
"VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
],
},
"vllm_regression_test": {
40  .ci/pytorch/functorch_doc_push_script.sh  (executable file)
@@ -0,0 +1,40 @@
#!/bin/bash

# This is where the local pytorch install in the docker image is located
pt_checkout="/var/lib/jenkins/workspace"
source "$pt_checkout/.ci/pytorch/common_utils.sh"
echo "functorch_doc_push_script.sh: Invoked with $*"

set -ex -o pipefail

version=${DOCS_VERSION:-nightly}
echo "version: $version"

# Build functorch docs
pushd $pt_checkout/functorch/docs
make html
popd

git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages
pushd functorch_ghpages

if [ "$version" == "main" ]; then
version=nightly
fi

git rm -rf "$version" || true
mv "$pt_checkout/functorch/docs/build/html" "$version"

git add "$version" || true
git status
git config user.email "soumith+bot@pytorch.org"
git config user.name "pytorchbot"
# If there aren't changes, don't make a commit; push is no-op
git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true
git status

if [[ "${WITH_PUSH:-}" == true ]]; then
git push -u origin gh-pages
fi

popd
@@ -1,25 +0,0 @@
From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001
From: Michael Wang <13521008+isVoid@users.noreply.github.com>
Date: Tue, 1 Apr 2025 17:28:05 -0700
Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage
(#185)

Co-authored-by: isVoid <isVoid@users.noreply.github.com>
---
numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++
1 file changed, 3 insertions(+)

diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
index 1641bf77..233e9ed7 100644
--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -365,6 +365,9 @@ def _find_api(self, fname):
else:
variants = ('_v2', '')

+ if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
+ return getattr(self.lib, fname)
+
for variant in variants:
try:
return getattr(self.lib, f'{fname}{variant}')
@@ -32,16 +32,6 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v
git config --global --add safe.directory /var/lib/jenkins/workspace
fi

# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
if [ -n "$NUMBA_CUDA_DIR" ]; then
NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
pushd "$NUMBA_CUDA_DIR"
patch -p4 <"$NUMBA_PATCH"
popd
fi

echo "Environment variables:"
env

@@ -334,17 +324,11 @@ test_python() {
}

test_python_smoke() {
# Smoke tests for H100/B200
# Smoke tests for H100
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}

test_python_smoke_b200() {
# Targeted smoke tests for B200 - staged approach to avoid too many failures
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}

test_h100_distributed() {
# Distributed tests at H100
time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running

@@ -1556,10 +1540,14 @@ test_executorch() {
install_torchvision
install_torchaudio

INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh"

pushd /executorch
"${INSTALL_SCRIPT}" setup_executorch

export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

# NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
# from the PR
bash .ci/scripts/setup-linux.sh --build-tool cmake

echo "Run ExecuTorch unit tests"
pytest -v -n auto

@@ -1573,6 +1561,10 @@ test_executorch() {

popd

# Test torchgen generated code for Executorch.
echo "Testing ExecuTorch op registration"
"$BUILD_BIN_DIR"/test_edge_op_registration

assert_git_not_dirty
}

@@ -1580,7 +1572,6 @@ test_linux_aarch64() {
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose

# Dynamo tests

@@ -1779,8 +1770,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
test_xpu_bin
elif [[ "${TEST_CONFIG}" == smoke ]]; then
test_python_smoke
elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then
test_python_smoke_b200
elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
test_h100_distributed
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then

@@ -137,7 +137,7 @@ sccache --show-stats
python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
(
if "%BUILD_ENVIRONMENT%"=="" (
echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash.
echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash.
) else (
copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%"

@@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" (
) else (
set CONDA_PARENT_DIR=C:\Jenkins
)
set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3

:: Be conservative here when rolling out the new AMI with conda. This will try
:: to install conda as before if it couldn't find the conda installation. This
:: can be removed eventually after we gain enough confidence in the AMI
if not exist %CONDA_ROOT_DIR% (
if not exist %CONDA_PARENT_DIR%\Miniconda3 (
set INSTALL_FRESH_CONDA=1
)

@@ -17,14 +17,10 @@ if "%INSTALL_FRESH_CONDA%"=="1" (
if errorlevel 1 exit /b
if not errorlevel 0 exit /b

%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR%
%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
)

:: Activate conda so that we can use its commands, i.e. conda, python, pip
call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%
:: Activate conda so that we can use its commands, i.e. conda, python, pip
call conda activate py_tmp

call pip install -r .ci/docker/requirements-ci.txt
call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3

@@ -14,7 +14,7 @@ if not errorlevel 0 exit /b
:: build\torch. Rather than changing all these references, making a copy of torch folder
:: from conda to the current workspace is easier. The workspace will be cleaned up after
:: the job anyway
xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\

pushd .
if "%VC_VERSION%" == "" (

@@ -38,14 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
fi

# TODO: Move both of them to Windows AMI
python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1

# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments
# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node'
# scipy from 1.6.3 to 1.10
# expecttest from 0.1.3 to 0.3.0
# xdoctest from 1.0.2 to 1.3.0
python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42"
python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1

# Install Z3 optional dependency for Windows builds.
python -m pip install z3-solver==4.15.1.0

@@ -59,6 +52,9 @@ python -m pip install parameterized==0.8.1
# Install pulp for testing ilps under torch\distributed\_tools
python -m pip install pulp==2.9.0

# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
python -m pip install expecttest==0.3.0

run_tests() {
# Run nvidia-smi if available
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do

@@ -264,7 +264,7 @@ def unzip_artifact_and_replace_files() -> None:
change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py")

for file in Path(f"artifacts/dist/{old_stem}").glob(
"*.dist-info/*",
"*.dist-info/**",
):
change_content_to_new_version(file)
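The hunk above switches the `pathlib` glob pattern between a single-level and a recursive wildcard. As a small illustration of the difference (the directory path here is a hypothetical unpacked wheel, not taken from the diff): `*.dist-info/*` only matches direct children of the `.dist-info` directory, while `*.dist-info/**` descends into it recursively (exactly which entries a trailing `**` yields also depends on the Python version).

```python
from pathlib import Path

dist = Path("artifacts/dist/example")  # hypothetical unpacked wheel directory

shallow = list(dist.glob("*.dist-info/*"))     # direct children only, e.g. METADATA, RECORD
recursive = list(dist.glob("*.dist-info/**"))  # walks the .dist-info tree recursively
print(len(shallow), len(recursive))
```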
16  .github/actions/setup-win/action.yml  (vendored)
@@ -6,12 +6,6 @@ inputs:
cuda-version:
description: which cuda version to install, 'cpu' for none
required: true
python-version:
required: false
type: string
default: "3.10"
description: |
The python version to be used. Will be 3.10 by default

runs:
using: composite

@@ -44,24 +38,18 @@ runs:
CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat"

{
echo "CONDA=${CONDA}";
echo "CONDA_RUN=${CONDA} run --no-capture-output";
echo "CONDA_BUILD=${CONDA} run conda-build";
echo "CONDA_INSTALL=${CONDA} install";
} >> "${GITHUB_ENV}"

- name: Setup Python3
env:
PYTHON_VERSION: ${{ inputs.python-version }}
shell: bash
run: |
set +e
set -x

# Create new py_tmp env with python-version
${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp

PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
PYTHON3=$(${CONDA_RUN} which python3)
EXIT_CODE=$?

if [[ "${EXIT_CODE}" == "0" ]]; then

@@ -74,7 +62,7 @@ runs:
# installation, which is Python 3 based. Its Python is default to Python 3. Further, there
# is also the Miniconda installation that is Python 2 based, and both can be installed if
# needed. In both cases, Python binary is just called python
PYTHON=$(${CONDA_RUN} -n py_tmp which python)
PYTHON=$(${CONDA_RUN} which python)
EXIT_CODE=$?

if [[ "${EXIT_CODE}" == "0" ]]; then

2  .github/ci_commit_pins/vllm.txt  (vendored)

@@ -1 +1 @@
090197034faf3b193c4467cedeb9281e3078892d
5bcc153d7bf69ef34bc5788a33f60f1792cf2861

3  .github/labeler.yml  (vendored)

@@ -130,6 +130,3 @@
- torch/csrc/inductor/aoti_include/**
- torchgen/aoti/**
- torchgen/gen_aoti_c_shim.py

"ciflow/vllm":
- .github/ci_commit_pins/vllm.txt

1  .github/pytorch-probot.yml  (vendored)

@@ -36,7 +36,6 @@ ciflow_push_tags:
- ciflow/win-arm64
- ciflow/h100-symm-mem
- ciflow/h100-cutlass-backend
- ciflow/b200
retryable_workflows:
- pull
- trunk

2  .github/scripts/generate_ci_workflows.py  (vendored)

@@ -135,7 +135,7 @@ ROCM_SMOKE_WORKFLOWS = [
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["6.4"],
python_versions=["3.10"],
python_versions=["3.9"],
),
ciflow_config=CIFlowConfig(
labels={

2  .github/workflows/_binary-test-linux.yml  (vendored)

@@ -187,6 +187,8 @@ jobs:

- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }}
if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}

- name: configure aws credentials

14  .github/workflows/_docs.yml  (vendored)

@@ -75,6 +75,10 @@ jobs:
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 30m to finish python docs unless there are issues
timeout-minutes: 30
- docs_type: functorch
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 15m to finish functorch docs unless there are issues
timeout-minutes: 15
# Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
# The current name requires updating the database last docs push query from test-infra every time the matrix is updated
name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}

@@ -207,6 +211,16 @@ jobs:
path: cppdocs/
s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs

- name: Upload functorch Docs Preview
uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }}
with:
retention-days: 14
s3-bucket: doc-previews
if-no-files-found: error
path: functorch_ghpages/nightly/
s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs

- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()

28  .github/workflows/_get-changed-files.yml  (vendored)

@@ -2,12 +2,6 @@ name: Get Changed Files

on:
workflow_call:
inputs:
all_files:
description: "Whether to return all files instead of just changed files"
required: false
type: boolean
default: false
outputs:
changed-files:
description: "List of changed files (space-separated) or '*' if not in a PR"

@@ -32,23 +26,17 @@ jobs:
# Get the PR number from the github context
PR_NUMBER="${{ github.event.number }}"

# Check if all_files is requested
if [ "${{ inputs.all_files }}" = "true" ]; then
echo "all_files input is true, returning all files"
echo "changed-files=*" >> "$GITHUB_OUTPUT"
else
# Use gh CLI to get changed files in the PR with explicit repo
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
# Use gh CLI to get changed files in the PR with explicit repo
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')

if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
CHANGED_FILES="*"
fi

echo "Changed files: $CHANGED_FILES"
echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
CHANGED_FILES="*"
fi

echo "Changed files: $CHANGED_FILES"
echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"

else
echo "Not in PR context, setting changed files to '*'"
echo "changed-files=*" >> "$GITHUB_OUTPUT"
2  .github/workflows/_linux-test.yml  (vendored)

@@ -169,7 +169,7 @@ jobs:
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }}
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}

- name: Setup GPU_FLAG for docker run

2  .github/workflows/_win-build.yml  (vendored)

@@ -151,7 +151,7 @@ jobs:
BUILD_WHEEL: 1
MAX_JOBS: 8
CUDA_VERSION: ${{ inputs.cuda-version }}
PYTHON_VERSION: "3.10"
PYTHON_VERSION: "3.9"
SCCACHE_BUCKET: "ossci-compiler-cache"
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
SCCACHE_REGION: us-east-1

2  .github/workflows/_win-test.yml  (vendored)

@@ -184,7 +184,7 @@ jobs:
env:
USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
INSTALL_WINDOWS_SDK: 1
PYTHON_VERSION: "3.10"
PYTHON_VERSION: 3.9
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}

2  .github/workflows/build-vllm-wheel.yml  (vendored)

@@ -178,7 +178,7 @@ jobs:
contents: read
container:
image: continuumio/miniconda3:4.12.0
environment: ${{ ((github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'nightly-wheel-upload' || '' }}
environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

5  .github/workflows/docker-builds.yml  (vendored)

@@ -70,8 +70,9 @@ jobs:
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter,
pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
# Executorch pin needs update
# pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu,
pytorch-linux-noble-riscv64-py3.12-gcc14
]

14  .github/workflows/generated-linux-binary-manywheel-rocm-main.yml  (generated, vendored)

@@ -44,7 +44,7 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_10-rocm6_4-build:
manywheel-py3_9-rocm6_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type

@@ -58,16 +58,16 @@ jobs:
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-rocm6_4
build_name: manywheel-py3_9-rocm6_4
build_environment: linux-binary-manywheel-rocm
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-rocm6_4-test: # Testing
manywheel-py3_9-rocm6_4-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_10-rocm6_4-build
- manywheel-py3_9-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240

@@ -82,14 +82,14 @@ jobs:
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
DESIRED_PYTHON: "3.9"
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: manywheel-py3_10-rocm6_4
name: manywheel-py3_9-rocm6_4
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
8  .github/workflows/lint.yml  (vendored)

@@ -31,8 +31,6 @@ jobs:
if: github.repository_owner == 'pytorch'
name: Get changed files
uses: ./.github/workflows/_get-changed-files.yml
with:
all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }}

lintrunner-clang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main

@@ -55,7 +53,7 @@ jobs:
with:
timeout: 120
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter
docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter
# NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
# to run git rev-parse HEAD~:.ci/docker when a new image is needed
fetch-depth: 0

@@ -266,10 +264,10 @@ jobs:
with:
submodules: false
fetch-depth: 1
- name: Setup Python 3.10
- name: Setup Python 3.9
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.10'
python-version: '3.9'
architecture: x64
cache: pip
- name: Install dependencies

26  .github/workflows/pull.yml  (vendored)

@@ -316,6 +316,32 @@ jobs:
]}
secrets: inherit

linux-jammy-py3-clang12-executorch-build:
if: false # Docker build needs pin update
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit

linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
if: false # Has been broken for a while
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml

76  .github/workflows/test-b200.yml  (vendored)

@@ -1,76 +0,0 @@
# B200 Smoke Tests CI Workflow
#
# This workflow runs smoke tests on B200 hardware
#
# Flow:
# 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200
# 2. Runs smoke tests on linux.dgx.b200 runner
# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function
#
# Triggered by:
# - Pull requests modifying this workflow file
# - Manual dispatch
# - Schedule (every 6 hours)
# - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag)

name: B200 Smoke Tests

on:
pull_request:
paths:
- .github/workflows/test-b200.yml
workflow_dispatch:
schedule:
- cron: 0 4,10,16,22 * * * # every 6 hours
push:
tags:
- ciflow/b200/*

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions:
id-token: write
contents: read

jobs:

get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-jammy-cuda12_8-py3_10-gcc11-sm100-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
# config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm100-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit
24  .github/workflows/trunk.yml  (vendored)

@@ -259,27 +259,3 @@ jobs:
docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-py3-clang12-executorch-build:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit

linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit

24  .github/workflows/unstable.yml  (vendored)

@@ -53,3 +53,27 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit

linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit

2  .github/workflows/vllm.yml  (vendored)

@@ -36,8 +36,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# When building vLLM, uv doesn't like that we rename wheel without changing the wheel metadata
allow-reuse-old-whl: false
build-additional-packages: "vision audio"
build-external-packages: "vllm"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11

3  .gitignore  (vendored)

@@ -259,9 +259,6 @@ gen
.pytest_cache
aten/build/*

# Linker scripts for prioritized text optimization
cmake/linker_script.ld

# Bram
plsdontbreak

@@ -123,7 +123,6 @@ is_formatter = true
code = 'MYPY'
include_patterns = [
'setup.py',
'functorch/dim/**/*.py',
'torch/**/*.py',
'torch/**/*.pyi',
'caffe2/**/*.py',

@@ -196,7 +195,6 @@ exclude_patterns = [
'tools/test/gen_operators_yaml_test.py',
'tools/test/gen_oplist_test.py',
'tools/test/test_selective_build.py',
'tools/experimental/dynamic_shapes/torchfuzz/**',
]
command = [
'python3',

@@ -966,6 +964,7 @@ exclude_patterns = [
'test/jit/**', # should be run through test/test_jit.py
'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py
'test/fx/**', # should be run through test/test_fx.py
'test/bottleneck_test/**', # excluded by test/run_test.py
'test/package/**', # excluded by test/run_test.py
'test/distributed/argparse_util_test.py',
'test/distributed/bin/test_script.py',

@@ -1411,6 +1410,8 @@ exclude_patterns = [
'torch/utils/benchmark/utils/timer.py',
'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
'torch/utils/bottleneck/__init__.py',
'torch/utils/bottleneck/__main__.py',
'torch/utils/bundled_inputs.py',
'torch/utils/checkpoint.py',
'torch/utils/collect_env.py',
@@ -1,4 +1,5 @@
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW)

# Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this
# sometimes makes XCode C compiler gets detected as "Clang", even when the C++

@@ -379,13 +380,6 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
"CPU_AARCH64" OFF)
# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le.
set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
if(LINUX AND CPU_AARCH64)
set(USE_PRIORITIZED_TEXT_DEFAULT ON)
endif()
cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
"${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)

option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance

@@ -663,11 +657,6 @@ endif(MSVC)

string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")

# Set linker max-page-size to 64KiB on AArch64 Linux
if(LINUX AND CPU_AARCH64)
add_link_options_if_supported("-z,max-page-size=0x10000")
endif()

# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
# applicable to mobile are disabled by this variable. Setting
# `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it

@@ -902,7 +891,7 @@ IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
endif()

# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100.
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32)
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a")
set(USE_FBGEMM_GENAI ON)
endif()

@@ -1432,57 +1421,3 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
DESTINATION "${CMAKE_INSTALL_BINDIR}")
endif()

if(USE_PRIORITIZED_TEXT_FOR_LD)
add_compile_options(
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
)
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")

add_custom_command(
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
COMMENT "Generating prioritized text linker files"
VERBATIM
)

add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")

if(BUILD_PYTHON)
set(LINKER_OPT_TARGETS torch_python)
endif()

if(NOT BUILD_LIBTORCHLESS)
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
if(USE_CUDA)
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
endif()
if(USE_XPU)
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
endif()
if(USE_ROCM)
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
endif()
endif()

foreach(tgt IN LISTS LINKER_OPT_TARGETS)
if(TARGET ${tgt})
add_dependencies("${tgt}" generate_linker_script)
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
else()
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
endif()
endforeach()

else()
if(LINUX AND CPU_AARCH64)
message(WARNING [[
It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]])
endif()
endif()
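The CMake block above wires `tools/setup_helpers/generate_linker_script.py` into the build: objects are compiled with `-ffunction-sections`/`-fdata-sections`, the script turns `cmake/prioritized_text.txt` into `cmake/linker_script.ld`, and the listed targets link with `-T,<script>`. Purely as a hypothetical sketch of the input/output shape of such a generator (the real script may work quite differently), the key idea is that with `-ffunction-sections` every function gets its own `.text.<symbol>` section, which a linker script can then group together:

```python
# Hypothetical sketch only; not the actual tools/setup_helpers/generate_linker_script.py.
def prioritized_sections(filein: str) -> list[str]:
    # Read one symbol (or glob) per line, skipping blanks and comments.
    with open(filein) as f:
        symbols = [ln.strip() for ln in f if ln.strip() and not ln.startswith("#")]
    # Each entry names an input section the linker script would place early in .text.
    return [f"*(.text.{sym})" for sym in symbols]
```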
@@ -317,20 +317,10 @@ IF(USE_FBGEMM_GENAI)
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)

# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(
set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
endif()
set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})

hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)

@@ -65,24 +65,14 @@ DLDataType getDLDataType(const Tensor& t) {
break;
// TODO(#146647): use macro here instead of spelling out each shell dtype
case ScalarType::Float8_e5m2:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2;
break;
case ScalarType::Float8_e5m2fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2fnuz;
break;
case ScalarType::Float8_e4m3fn:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fn;
break;
case ScalarType::Float8_e4m3fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fnuz;
break;
case ScalarType::Float8_e8m0fnu:
dtype.code = DLDataTypeCode::kDLFloat8_e8m0fnu;
TORCH_CHECK_BUFFER(false, "float8 types are not supported by dlpack");
break;
case ScalarType::Float4_e2m1fn_x2:
dtype.code = DLDataTypeCode::kDLFloat4_e2m1fn;
dtype.lanes = 2;
dtype.bits = 4;
TORCH_CHECK_BUFFER(false, "float4 types are not supported by dlpack");
break;
case ScalarType::QInt8:
case ScalarType::QUInt8:

@@ -187,11 +177,7 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat

ScalarType toScalarType(const DLDataType& dtype) {
ScalarType stype = ScalarType::Undefined;
if (dtype.code != DLDataTypeCode::kDLFloat4_e2m1fn) {
TORCH_CHECK_BUFFER(
dtype.lanes == 1,
"ATen does not support lanes != 1 for dtype code", std::to_string(dtype.code));
}
TORCH_CHECK_BUFFER(dtype.lanes == 1, "ATen does not support lanes != 1");
switch (dtype.code) {
case DLDataTypeCode::kDLUInt:
switch (dtype.bits) {

@@ -283,73 +269,6 @@ ScalarType toScalarType(const DLDataType& dtype) {
false, "Unsupported kDLBool bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e5m2:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2 bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e5m2fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fn:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fn;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fn bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e8m0fnu:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e8m0fnu;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e8m0fnu bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat4_e2m1fn:
switch (dtype.bits) {
case 4:
switch (dtype.lanes) {
case 2:
stype = ScalarType::Float4_e2m1fn_x2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn lanes ", std::to_string(dtype.lanes));
}
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn bits ", std::to_string(dtype.bits));
}
break;
default:
TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code));
}

@@ -401,13 +320,30 @@ T* toDLPackImpl(const Tensor& src) {
// The following code detects whether the src follows
// a continuous pattern. If the src follows such pattern (common-case)
// then we do not need to normalize the strides.
bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1;
bool need_normalize_strides = false;
int64_t expected_stride = 1;
for (int i = src.dim() - 1; i >= 0; i--) {
// detect if we do not meet continuous pattern
// and the size is 1, so there is opportunity to normalize
if (src.stride(i) != expected_stride && src.size(i) == 1) {
need_normalize_strides = true;
break;
}
expected_stride *= src.size(i);
}

// less common case, try normalizing the strides
if (need_normalize_strides) {
// create a new tensor with possibly normalized strides
// gh-83069
auto shape = src.sizes();
view = src.as_strided(shape, {1}, src.storage_offset());
auto strides = src.strides().vec();
for (int i = 0; i < src.dim(); i++) {
if (shape[i] < 2) {
strides[i] = 1;
}
}
view = src.as_strided(shape, strides, src.storage_offset());
}

ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);

@@ -418,8 +354,8 @@ T* toDLPackImpl(const Tensor& src) {
atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device());
atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim());
atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src);
atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(view.sizes().data());
atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(view.strides().data());
atDLMTensor->tensor.dl_tensor.shape = view.sizes().data();
atDLMTensor->tensor.dl_tensor.strides = view.strides().data();
atDLMTensor->tensor.dl_tensor.byte_offset = 0;
fillVersion(&atDLMTensor->tensor);
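The stride-normalization logic in the `toDLPackImpl` hunk above walks dimensions from the innermost outwards and only rewrites strides of size-0/1 dimensions (the gh-83069 case), since such a dimension can carry any stride without changing which elements are addressable. A small Python mirror of that rule, for intuition only (not the actual ATen code):

```python
def normalize_strides(sizes, strides):
    # Detect a size-0/1 dimension whose stride breaks the contiguous pattern.
    need = False
    expected = 1
    for size, stride in reversed(list(zip(sizes, strides))):
        if stride != expected and size == 1:
            need = True
            break
        expected *= size
    if not need:
        return list(strides)
    # Pin every size-0/1 dimension to stride 1, mirroring the C++ loop above.
    return [1 if size < 2 else stride for size, stride in zip(sizes, strides)]

# Example: a (4, 1) column view of a transposed tensor may report strides (1, 4);
# the size-1 dimension is normalized to stride 1.
print(normalize_strides((4, 1), (1, 4)))  # [1, 1]
```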
@ -1637,7 +1637,9 @@ bool gemm_and_bias(
|
||||
if (activation == GEMMAndBiasActivationEpilogue::RELU) {
|
||||
epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
|
||||
} else if (activation == GEMMAndBiasActivationEpilogue::GELU) {
|
||||
#if CUDA_VERSION >= 11040 || defined(USE_ROCM)
|
||||
epilogue = CUBLASLT_EPILOGUE_GELU_BIAS;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (bias != nullptr) {
|
||||
@ -1929,6 +1931,7 @@ void scaled_gemm(
|
||||
bool use_fast_accum) {
|
||||
// Note: see `cublasCommonArgs` for various non-intuitive manupulations
|
||||
// of input arguments to this function.
|
||||
#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
|
||||
const auto computeType = CUBLAS_COMPUTE_32F;
|
||||
const auto scaleType = CUDA_R_32F;
|
||||
const float alpha_val = 1.0;
|
||||
@ -1951,8 +1954,8 @@ void scaled_gemm(
|
||||
#if ROCM_VERSION >= 70000
|
||||
if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) {
|
||||
// TODO: add constraints based on hipblaslt internals
|
||||
TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0),
|
||||
"M, N must be multiples of 16 and K should be multiple of 128 for MX format. "
|
||||
TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0),
"Matrix dimensions must be multiples of 32 for MX format. "
"Got m=", m, ", n=", n, ", k=", k);
}
#endif
@ -2130,6 +2133,8 @@ void scaled_gemm(
" scaleType ",
scaleType);
return;
#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM)
TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above");
}

void int8_gemm(

@ -266,14 +266,11 @@ CUDAGeneratorImpl::CUDAGeneratorImpl(
* See Note [Acquire lock when using random generators]
*/
void CUDAGeneratorImpl::set_current_seed(uint64_t seed) {
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
state_->seed_ = seed;
state_->philox_offset_per_thread_ = 0;
no_reset_rnn_state_.clear();
} else {
TORCH_CHECK(state_->seed_ == seed, "CUDAGeneratorImpl::set_current_seed can be called during stream capture only if new seed is the same as the original seed.");
// no-op case
}
at::cuda::assertNotCapturing(
"Cannot call CUDAGeneratorImpl::set_current_seed");
state_->seed_ = seed;
state_->philox_offset_per_thread_ = 0;
no_reset_rnn_state_.clear();
}

/**
@ -302,6 +299,9 @@ uint64_t CUDAGeneratorImpl::get_offset() const {
* Gets the current seed of CUDAGeneratorImpl.
*/
uint64_t CUDAGeneratorImpl::current_seed() const {
// Debatable if current_seed() should be allowed in captured regions.
// Conservatively disallow it for now.
at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed");
return state_->seed_;
}

@ -346,6 +346,8 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
* and size of the internal state.
*/
void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
at::cuda::assertNotCapturing(
"Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing.");
static const size_t seed_size = sizeof(uint64_t);
static const size_t offset_size = sizeof(int64_t);
static const size_t total_size = seed_size + offset_size;
@ -400,27 +402,15 @@ c10::intrusive_ptr<c10::GeneratorImpl> CUDAGeneratorImpl::graphsafe_get_state()
*/
void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) {
// see Note [Why enforce RNG offset % 4 == 0?]

// Note: If you use CUDNN RNN's, calling
// set_philox_offset_per_thread instead of set_offset will cause the
// cudnn RNN rng state to become stale.
TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4");
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
state_->philox_offset_per_thread_ = offset;
} else {
state_->offset_intragraph_ = offset;
}
state_->philox_offset_per_thread_ = offset;
}

/**
* Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl.
*/
uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const {
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
return state_->philox_offset_per_thread_;
} else {
return state_->offset_intragraph_;
}
return state_->philox_offset_per_thread_;
}

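For context, a hedged Python sketch of the behavior these generator changes encode (assumes a CUDA device and the public torch.cuda.graph API; the exact capture-time checks live inside CUDAGeneratorImpl, so this only illustrates the user-visible contract):

    import torch

    torch.cuda.manual_seed(1234)          # seeding is done outside capture; during capture,
                                          # re-seeding is only a no-op if the seed matches
    static_x = torch.randn(8, device="cuda")
    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        y = static_x + torch.rand_like(static_x)   # RNG offsets are tracked per graph
    g.replay()                            # replay reuses the graph-safe Philox state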
/**

@ -19,7 +19,7 @@
#define DLPACK_MAJOR_VERSION 1

/*! \brief The current minor version of dlpack */
#define DLPACK_MINOR_VERSION 1
#define DLPACK_MINOR_VERSION 0

/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
@ -32,7 +32,9 @@
#define DLPACK_DLL
#endif

// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stdint.h>
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stddef.h>

#ifdef __cplusplus
@ -157,26 +159,6 @@ typedef enum {
kDLComplex = 5U,
/*! \brief boolean */
kDLBool = 6U,
/*! \brief FP8 data types */
kDLFloat8_e3m4 = 7U,
kDLFloat8_e4m3 = 8U,
kDLFloat8_e4m3b11fnuz = 9U,
kDLFloat8_e4m3fn = 10U,
kDLFloat8_e4m3fnuz = 11U,
kDLFloat8_e5m2 = 12U,
kDLFloat8_e5m2fnuz = 13U,
kDLFloat8_e8m0fnu = 14U,
/*! \brief FP6 data types
* Setting bits != 6 is currently unspecified, and the producer must ensure it is set
* while the consumer must stop importing if the value is unexpected.
*/
kDLFloat6_e2m3fn = 15U,
kDLFloat6_e3m2fn = 16U,
/*! \brief FP4 data types
* Setting bits != 4 is currently unspecified, and the producer must ensure it is set
* while the consumer must stop importing if the value is unexpected.
*/
kDLFloat4_e2m1fn = 17U,
} DLDataTypeCode;

/*!
@ -190,12 +172,6 @@ typedef enum {
* - int8: type_code = 0, bits = 8, lanes = 1
* - std::complex<float>: type_code = 5, bits = 64, lanes = 1
* - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
* - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory)
* - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory)
* - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory)
*
* When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e.,
* for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element.
*/
typedef struct {
/*!
@ -253,12 +229,12 @@ typedef struct {
/*! \brief The data type of the pointer*/
DLDataType dtype;
/*! \brief The shape of the tensor */
int64_t* shape;
const int64_t* shape;
/*!
* \brief strides of the tensor (in number of elements, not bytes)
* can be NULL, indicating tensor is compact and row-majored.
*/
int64_t* strides;
const int64_t* strides;
/*! \brief The offset in bytes to the beginning pointer to data */
uint64_t byte_offset;
} DLTensor;
@ -293,7 +269,7 @@ typedef struct DLManagedTensor {
void (*deleter)(struct DLManagedTensor * self);
} DLManagedTensor;

// bit masks used in the DLManagedTensorVersioned
// bit masks used in in the DLManagedTensorVersioned

/*! \brief bit mask to indicate that the tensor is read only. */
#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
@ -306,14 +282,6 @@ typedef struct DLManagedTensor {
*/
#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)

/*
* \brief bit mask to indicate that whether a sub-byte type is packed or padded.
*
* The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can
* be set by the producer to signal that a tensor of sub-byte type is padded.
*/
#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL)

/*!
* \brief A versioned and managed C Tensor object, manage memory of DLTensor.
*

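The DLPack header above backs PyTorch's zero-copy tensor exchange. A minimal sketch of the Python-level round trip (any DLPack-compatible producer/consumer would work in place of PyTorch on either side):

    import torch
    from torch.utils.dlpack import to_dlpack, from_dlpack

    t = torch.arange(6, dtype=torch.float32).reshape(2, 3)
    capsule = to_dlpack(t)       # wraps the tensor in a DLManagedTensor capsule
    u = from_dlpack(capsule)     # imports it again without copying the data
    assert u.data_ptr() == t.data_ptr()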
@ -171,8 +171,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {

POINTWISE_BOXED(fill_.Scalar);
POINTWISE_BOXED(zero_);
// This is special because this op doesn't return anything
m.impl("_assert_tensor_metadata", native::_assert_tensor_metadata);

#undef UNARY_POINTWISE
#undef UNARY_POINTWISE_ALL

@ -81,7 +81,7 @@ Tensor math_channel_shuffle(const Tensor& self, int64_t groups) {
// TODO: contiguous can be made to preserve the memory format
// of the input. However since the above reshape clobbers h and w
// it may not be safe to do that, since channels_last contiguous
// may think oc and the last dim correspond to h,w?
// may think oc and and the last dim correspond to h,w?
// It is not clear, however from initial looking around it feels that
// this may not be correct.
// In this case channels last will likely require custom implementation

@ -1,4 +1,3 @@
#pragma once
#include <ATen/core/Tensor.h>
#include <ATen/Config.h>
#include <cstdint>

@ -97,38 +97,43 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) {
int64_t nDims = self.dim();
TORCH_CHECK(nDims >= 2, "dimensions must larger than 1");

auto height = self.sym_size(0);
auto width = self.sym_size(1);
int64_t height = self.size(0);
int64_t width = self.size(1);

if (nDims > 2) {
int64_t dim1 = height;
for (const auto i : c10::irange(1, nDims)) {
if (self.sym_size(i) != height) {
if (self.size(i) != dim1) {
TORCH_CHECK(false, "all dimensions of input must be of equal length");
}
}
}

auto storage_offset = self.sym_storage_offset();
auto size = std::min(height, width);
int64_t storage_offset = self.storage_offset();
std::vector<int64_t> sizes;
std::vector<int64_t> strides;
int64_t size = std::min(height, width);

int64_t stride = 0;
for (const auto i : c10::irange(nDims)) {
stride += self.stride(i);
}
std::vector<SymInt> strides{stride};
std::vector<SymInt> sizes{size};
strides.push_back(stride);
sizes.push_back(size);

auto main_diag = self.as_strided_symint(sizes, strides, storage_offset);
auto main_diag = self.as_strided(sizes, strides, storage_offset);
main_diag.fill_(fill_value);

if (wrap && nDims == 2 && height > width + 1) {
auto step = width + 1;
auto wrap_size = ((self.numel() + step - 1) / step) - size;
std::vector<SymInt> wrap_sizes{wrap_size};
std::vector<int64_t> wrap_sizes;

auto offset = self.stride(0) * (width + 1);
int64_t step = width + 1;
int64_t wrap_size = ((self.numel() + step - 1) / step) - size;
wrap_sizes.push_back(wrap_size);

auto wrap_diag = self.as_strided_symint(wrap_sizes, strides, storage_offset + offset);
int64_t offset = self.stride(0) * (width + 1);

auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset);
wrap_diag.fill_(fill_value);
}

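The fill_diagonal_ rewrite above trades symbolic sizes/strides for plain int64_t values; the observable behavior of the op stays the same. A small usage sketch of the public API being touched:

    import torch

    a = torch.zeros(7, 3)
    a.fill_diagonal_(5, wrap=True)   # wrap=True restarts the diagonal every width+1 rows
    print(a)                         # (i, i) entries filled, then wrapped below row width+1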
@ -23,6 +23,8 @@ Tensor& max_unpooling2d_forward_out_cpu(
// Nondeterministic with duplicate indices
at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out");

auto oheight = output_size[0];
auto owidth = output_size[1];
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices_.scalar_type());
@ -43,9 +45,6 @@ Tensor& max_unpooling2d_forward_out_cpu(
self_.sizes(), " with dimension ", i , " being empty.");
}

auto oheight = output_size[0];
auto owidth = output_size[1];

auto memory_format = self_.suggest_memory_format();
auto self = self_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format);

@ -73,7 +73,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value)
for (const auto i : c10::irange((size_t)l_pad)) {
auto pad_idx = pad.size() - ((i + 1) * 2);
auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, "
"which is invalid. Check dimension ", l_diff + i, " of your input.");
new_shape.emplace_back(new_dim);

@ -85,11 +85,11 @@ void cpu_max_unpool(
if constexpr (is_3d) {
TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
" (output volumes are of size ", output_depth,
"x", output_height, "x", output_width, ")");
"x", output_height, "x", output_width);
} else {
TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
" (output volumes are of size ", output_height,
"x", output_width, ")");
"x", output_width);
}
}

@ -1138,14 +1138,9 @@ bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) {
bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) {
// TODO: We might want to enforce some structure on the shapes of the scale
// tensors
bool is_fp8_path = (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4));
bool is_packed_fp4_path = false;
#ifdef USE_ROCM
is_packed_fp4_path = (t.scalar_type() == ScalarType::Float4_e2m1fn_x2 && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1) * 2, 32), 4));
#endif
return (is_fp8_path || is_packed_fp4_path) && scale.is_contiguous();
return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4)
&& scale.is_contiguous());
}

bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) {
@ -1386,15 +1381,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");

int packed_factor = 1;
if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
// For float4 data type, each byte stores two 4-bit floating-point values,
// effectively packing two elements into one byte.
packed_factor = 2;
}
TORCH_CHECK(mat1.size(0) % 16 == 0 && (mat1.size(1) * packed_factor) % 128 == 0 &&
mat2.size(1) % 16 == 0,
"M, N must be multiples of 16 and K must be multiple of 128 for block-wise scaling");
TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
"Matrix dimensions must be multiples of 32 for block-wise scaling");

TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
out.scalar_type() == ScalarType::Half,

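A hedged sketch of the scale-shape arithmetic used by is_blockwise_1x32_scaling above (plain Python; the 128/4 round-ups mirror the checks in the diff and reflect an assumed backend layout, not a public API):

    def ceil_div(a, b):
        return -(-a // b)

    def round_up(a, multiple):
        return ceil_div(a, multiple) * multiple

    def expected_1x32_scale_numel(rows, cols, packed_fp4=False):
        # packed fp4 stores two values per byte, so the logical K is cols * 2
        logical_cols = cols * 2 if packed_fp4 else cols
        return round_up(rows, 128) * round_up(ceil_div(logical_cols, 32), 4)

    print(expected_1x32_scale_numel(256, 1024))                   # fp8 path
    print(expected_1x32_scale_numel(256, 512, packed_fp4=True))   # packed fp4 path (ROCm)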
@ -999,41 +999,12 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
dtypes[i] = iter.dtype(i);
}
auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
#ifdef USE_ROCM
constexpr int grp_sz = 128;
launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
if (unrl) {
auto offsets0 = offset_calc.get(idx);
auto offsets1 = offset_calc.get(idx + grp_sz);
auto offsets2 = offset_calc.get(idx + grp_sz * 2);
auto offsets3 = offset_calc.get(idx + grp_sz * 3);
void* out0 = data[0] + offsets0[0];
void* out1 = data[0] + offsets1[0];
void* out2 = data[0] + offsets2[0];
void* out3 = data[0] + offsets3[0];
arg0_t result0 = invoke(f, &data[1], &offsets0[1], &dtypes[1], 1);
arg0_t result1 = invoke(f, &data[1], &offsets1[1], &dtypes[1], 1);
arg0_t result2 = invoke(f, &data[1], &offsets2[1], &dtypes[1], 1);
arg0_t result3 = invoke(f, &data[1], &offsets3[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out0, result0);
c10::cast_and_store<arg0_t>(dtypes[0], out1, result1);
c10::cast_and_store<arg0_t>(dtypes[0], out2, result2);
c10::cast_and_store<arg0_t>(dtypes[0], out3, result3);
} else {
auto offsets = offset_calc.get(idx);
void* out = data[0] + offsets[0];
arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out, result);
}
});
#else
launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) {
auto offsets = offset_calc.get(idx);
void* out = data[0] + offsets[0];
arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out, result);
});
#endif
}
}

@ -51,7 +51,7 @@ std::vector<Tensor> foreach_tensor_list_op(
Op<opmath_t>(),
alpha.to<opmath_t>());

return std::move(tensor_lists[2]);
return tensor_lists[2];
}

template <typename T, template <class> class Op>

@ -45,7 +45,7 @@ std::vector<Tensor> foreach_binary_op(
/* res_arg_index */ 1>(),
Op<opmath_t>(),
scalar.to<opmath_t>());
return std::move(tensor_lists[1]);
return tensor_lists[1];
}

template <typename T, template <class> class Op>

@ -33,7 +33,7 @@ std::vector<Tensor> foreach_binary_op(
}

tensor_lists.emplace_back(tensors.vec());
tensor_lists.emplace_back(std::move(vec_res));
tensor_lists.emplace_back(vec_res);

using opmath_t = at::opmath_type<T>;
multi_tensor_apply<2, opmath_t>(
@ -46,7 +46,7 @@ std::vector<Tensor> foreach_binary_op(
/* res_arg_index */ 1>(),

Op<opmath_t>());
return std::move(tensor_lists[1]);
return tensor_lists[1];
}

template <typename T, template <class> class Op>

@ -56,7 +56,7 @@ std::vector<Tensor> foreach_binary_op(
Op<opmath_t>(),
scalar.data_ptr<T>(),
alpha.to<opmath_t>());
return std::move(tensor_lists[1]);
return tensor_lists[1];
}

template <typename T, template <class> class Op>

@ -57,7 +57,7 @@ std::vector<Tensor> foreach_pointwise_op(
scalar.to<opmath_t>());
});

return std::move(tensor_lists[3]);
return tensor_lists[3];
}

template <template <class> class Op>
@ -160,7 +160,7 @@ std::vector<Tensor> foreach_pointwise_op(
Op<opmath_t>());
});

return std::move(tensor_lists[3]);
return tensor_lists[3];
}

#define FOREACH_POINTWISE_OP_SCALAR(NAME, OP) \

@ -37,7 +37,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
vec_res.emplace_back(at::native::empty_like(t));
}
std::vector<std::vector<at::Tensor>> tensor_lists{
tensors1.vec(), tensors2.vec(), tensors3.vec(), std::move(vec_res)};
tensors1.vec(), tensors2.vec(), tensors3.vec(), vec_res};

AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
@ -56,7 +56,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
LerpFunctor<opmath_t>());
});

return std::move(tensor_lists[3]);
return tensor_lists[3];
}

void foreach_tensor_lerp_ternary_cuda_(
@ -104,7 +104,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
vec_res.emplace_back(at::native::empty_like(t));
}
std::vector<std::vector<at::Tensor>> tensor_lists{
tensors1.vec(), tensors2.vec(), std::move(vec_res)};
tensors1.vec(), tensors2.vec(), vec_res};

AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
@ -124,7 +124,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
weight.to<opmath_t>());
});

return std::move(tensor_lists[2]);
return tensor_lists[2];
}

void foreach_tensor_lerp_list_cuda_(
@ -173,7 +173,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda(
vec_res.emplace_back(at::native::empty_like(t));
}
std::vector<std::vector<at::Tensor>> tensor_lists{
tensors1.vec(), tensors2.vec(), std::move(vec_res)};
tensors1.vec(), tensors2.vec(), vec_res};

AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
@ -193,7 +193,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda(
LerpFunctor<opmath_t>());
});

return std::move(tensor_lists[2]);
return tensor_lists[2];
}

void foreach_tensor_lerp_scalarlist_cuda_(

@ -67,7 +67,7 @@ std::vector<Tensor> foreach_unary_op(TensorList tensors) {
/* res_arg_index */ 1>(),
Op<opmath_t>());

return std::move(tensor_lists[1]);
return tensor_lists[1];
}

template <typename scalar_t, template <class> class Op>

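The _foreach changes above only affect how the result vectors are returned or moved; user-visible semantics are unchanged. A short usage sketch of the fused foreach/lerp ops involved (these are private torch._foreach_* entry points, assumed to be present in a recent build):

    import torch

    xs = [torch.zeros(3) for _ in range(4)]
    ys = [torch.ones(3) for _ in range(4)]
    out = torch._foreach_lerp(xs, ys, 0.5)   # one fused kernel over the whole list
    print(out[0])                            # tensor([0.5000, 0.5000, 0.5000])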
@ -125,6 +125,8 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices_.scalar_type());
auto oheight = output_size[0];
auto owidth = output_size[1];

TensorArg output_arg{output, "output", 1}, self_arg{self_, "self_", 2},
indices_arg{indices_, "indices_", 3};
@ -147,9 +149,6 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
output_size.size() == 2,
"There should be exactly two elements (height, width) in output_size, but got ", output_size.size(), " elements.");

auto oheight = output_size[0];
auto owidth = output_size[1];

int64_t dimw = 2;
int64_t dimh = 1;
int64_t numBatch = 1;
@ -218,6 +217,9 @@ static void max_unpooling3d_shape_check(
IntArrayRef stride,
IntArrayRef padding,
const char *fn_name) {
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
TORCH_CHECK(
indices.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices.scalar_type());
@ -248,10 +250,6 @@ static void max_unpooling3d_shape_check(
"strides should be greater than zero, but got stride: ",
stride);

int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];

int dimw = 3;
int dimh = 2;
int dimt = 1;
@ -404,6 +402,8 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,
const Tensor& indices_,
IntArrayRef output_size,
Tensor& grad_input) {
int64_t oheight = output_size[0];
int64_t owidth = output_size[1];
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
@ -426,9 +426,6 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,

TORCH_CHECK(output_size.size() == 2, "output_size must have two elements, got size: ", output_size.size());

int64_t oheight = output_size[0];
int64_t owidth = output_size[1];

int64_t nInputCols, nInputRows, nInputPlane;

int dimw = 2;
@ -508,14 +505,13 @@ at::Tensor& max_unpooling3d_backward_out_cuda(const Tensor& grad_output_,
IntArrayRef padding,
Tensor& grad_input) {
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");

max_unpooling3d_shape_check(
self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()");

int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];

max_unpooling3d_shape_check(
self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()");

int batchSize = 0;
int inputSlices = 0;
int inputTime = 0;

@ -300,6 +300,8 @@ void nonzero_static_cuda_out_impl(
int64_t size,
int64_t fill_value,
Tensor& out) {
#if defined(CUDA_VERSION) || defined(USE_ROCM)

Tensor self_contiguous_ = self.contiguous();
// see comment in nonzero_cuda_out_impl on reqs for out
bool out_correct_size =
@ -375,6 +377,9 @@ void nonzero_static_cuda_out_impl(
if (need_to_copy) {
out.copy_(out_temp);
}
#else
TORCH_CHECK(false, "Nonzero_static is not supported for cuda <= 11.4");
#endif
}

Tensor& nonzero_out_cuda(const Tensor& self, Tensor& out) {

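The unpooling changes above mostly move the output_size reads ahead of (or after) the input checks. For reference, a small sketch of the public op whose checks these are; indices must be int64 and come from max_pool2d with return_indices=True:

    import torch
    import torch.nn.functional as F

    x = torch.randn(1, 1, 4, 4)
    pooled, indices = F.max_pool2d(x, kernel_size=2, return_indices=True)
    restored = F.max_unpool2d(pooled, indices, kernel_size=2, output_size=x.shape[-2:])
    print(restored.shape)   # torch.Size([1, 1, 4, 4])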
@ -226,38 +226,6 @@ __global__ void CatArrayBatchedCopy_contig(
}
}

template <typename T, typename IndexType, int Dims, int batch_size, int stride_size, int alignment, int elems_per_vec>
__global__ void CatArrayBatchedCopy_vectorized(
char* output,
CatArrInputTensorMetadata<T, IndexType, batch_size, stride_size> inputs,
TensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
const int concatDim,
IndexType trailingSize) {

IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
IndexType nElements = inputs.nElements[blockIdx.y] / elems_per_vec;

if(tid >= nElements) return;

const char * data = (char*)inputs.input[blockIdx.y];
IndexType offset = inputs.offset[blockIdx.y] * trailingSize / elems_per_vec;
IndexType dimSize = inputs.dimSize[blockIdx.y] * trailingSize / elems_per_vec;
int64_t dataOffset = (int64_t)offset * alignment; // in bytes

IndexType stride = gridDim.x * blockDim.x;

while( tid < nElements){
int64_t elementOffset = (int64_t)CatArrIndexToOffset<IndexType, Dims>::compute(
os.tensorSize, os.tensorStride, dimSize, concatDim, tid) * alignment; // in bytes
auto vec = at::native::memory::ld_vec<alignment>(data + (int64_t)alignment * tid);
at::native::memory::st_vec<alignment>(output + dataOffset + elementOffset, vec);
tid += stride;
}
}

/*
Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads
to improve memory bandwidth throughput.
@ -328,27 +296,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
scalar_t *data = (scalar_t *)(out.mutable_data_ptr());
CatArrInputTensorMetadata<scalar_t, unsigned int, batch_size, stride_size> catMetaData;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> outputParam;
// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.

constexpr bool isContig = stride_size == 1;
bool isAligned = true;
constexpr int alignment = 16;

// Next, let's initialize the size, stride arrays for the output Tensor.
// for contig case, we'll canonicalize output strides, so that
// we don't have arbitrary strides for dims of size 0
size_t stride0 = 1;
if (memory_format == c10::MemoryFormat::Contiguous) {
for (int i = nDims - 1; i >= 0; --i) {
for (int i = 0; i < nDims; ++i) {
outputParam.tensorSize[i] = out.size(i);
if (isContig) {
outputParam.tensorStride[i] = stride0;
stride0 *= out.size(i);
} else {
outputParam.tensorStride[i] = out.stride(i);
}
outputParam.tensorStride[i] = out.stride(i);
}
} else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) {
// permute the semantics of dims from NCHW to NHWC so that the input
@ -367,15 +320,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i

at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();

// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.

// for channels last computing slice size correctly is much more involved, so we never send it
// on the fully vectorized path
// we need output stride in cat dimension to be multiple of alignment,
// if we ever use it to compute offsets
// for catting in 0th dimension it doesn't matter
bool isInOutAligned = isContig && at::native::memory::get_alignment(data) >= alignment &&
memory_format == c10::MemoryFormat::Contiguous && (dimension == 0 ||
outputParam.tensorStride[dimension - 1] * sizeof(scalar_t) % alignment == 0);
bool isContig = true;
bool isAligned = true;
unsigned int max_elements_per_tensor = 0;

// Now we loop
@ -391,16 +341,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
// high-dimensional tensor
if (inputs[i+batchCounter].get().numel() > 0) {
dimSize = inputs[i+batchCounter].get().size(dimension);
if (isInOutAligned) {
auto t = inputs[i+batchCounter].get();
// similarly to output stride, we cannot trust stride value to
// determine slice size if the corresponding dimension is 1
// we have to multiply all the subsequent sizes
int64_t slice_size = dimension == 0 ? t.numel() : t.sizes()[dimension - 1] != 1 ?
t.strides()[dimension - 1] : c10::multiply_integers(t.sizes().begin() + dimension, t.sizes().end());
slice_size *= sizeof(scalar_t);
isInOutAligned &= (slice_size % alignment == 0);
}
}

catMetaData.input[batchCounter] = (scalar_t*)(inputs[i+batchCounter].get().const_data_ptr());
@ -411,12 +351,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
#ifdef USE_ROCM
// On ROCm, CatArrayBatchedCopy_contig is faster
isAligned = false;
isInOutAligned = false;
#else
// If at least one of the inputs is not aligned, we can't call the
// CatArrayBatchedCopy_alignedK_contig
isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]);
isInOutAligned &= at::native::memory::get_alignment(catMetaData.input[batchCounter]) >= alignment;
#endif

if (stride_size > 1) {
@ -427,6 +365,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j];
}
catMetaData.isContiguous[batchCounter] = false;
isContig = false;
} else {
catMetaData.isContiguous[batchCounter] = true;
}
@ -449,13 +388,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
max_elements_per_tensor, batchCounter);
#else
dim3 applyBlock, catGrid;
if (isInOutAligned) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, alignment>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) > 2) {
if (isContig && sizeof(scalar_t) > 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_16>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) == 2) {
} else if (isContig && sizeof(scalar_t) == 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_8>(
max_elements_per_tensor, batchCounter);
} else {
@ -463,30 +399,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
getCatGrid(batchCounter, catGrid);
}
#endif
int32_t trailingSize;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam;
if (isInOutAligned) {
// in this case we can and should flatten the tensors after the cat dim
// we want to view the tensors as if consisting of `alignment`-sized elements
// however, we might not be able to cleanly divide just the last dim -
// it might not be the multiple of alignment.
// however, we know that the full concatted slice is multiple of alignment,
// so if we flatten all the dims after and including concat dim,
// it will be divisible by alignment
// then we need to divide last out size by elems_per_vec,
// and divide all strides except last by elems_per_vec (last stride is 1 always)
// for input, we will fix up the sizes and strides in the kernel directly
kernelOutputParam = outputParam;
nDims = dimension + 1;
constexpr auto elems_per_vec = alignment / sizeof(scalar_t);
auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1];
kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec;
trailingSize = outputParam.tensorStride[dimension];
kernelOutputParam.tensorStride[dimension] = 1;
for (int i = 0; i < dimension; ++i) {
kernelOutputParam.tensorStride[i] /= elems_per_vec;
}
}

if (memory_format != c10::MemoryFormat::Contiguous) {
switch (dimension) {
@ -501,12 +413,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
}
// Template Declarations for dim = 1, 2, 3, 4
#define HANDLE_CASE(DIMS) \
if (isInOutAligned) {\
constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \
CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
(char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\
} else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\

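A hedged sketch of the flattening trick described in the comments above: with 16-byte vectorized loads, every dim from the concat dim onward is viewed as alignment-sized elements, which only works because the concatenated slice size is a multiple of the alignment (plain Python arithmetic, not a PyTorch API):

    def vectorized_view(sizes, cat_dim, itemsize, alignment=16):
        elems_per_vec = alignment // itemsize
        slice_elems = 1
        for s in sizes[cat_dim:]:
            slice_elems *= s
        assert (slice_elems * itemsize) % alignment == 0, "slice must divide evenly into vectors"
        # dims before cat_dim stay as-is; the flattened tail is counted in vectors
        return sizes[:cat_dim] + [slice_elems // elems_per_vec]

    print(vectorized_view([8, 3, 64], cat_dim=1, itemsize=4))   # [8, 48]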
@ -221,9 +221,22 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_
std::optional<CuFFTConfig> uncached_plan;
const CuFFTConfig * config = nullptr;

// Workaround for gh-63152, gh-58724
// Bluestein plans in CUDA 11.1 (cufft 10.3) cannot be re-used
// Bluestein's algorithm is only used when a size has large prime factors,
// sizes with only small prime factors can still be cached
if (plan_cache.max_size() > 0) {
bool use_caching = true;
#ifdef CUFFT_VERSION
if constexpr (10300 <= CUFFT_VERSION && CUFFT_VERSION < 10400) {
// Only cache plans for transforms with small prime factors
use_caching = std::none_of(
signal_size.begin() + 1, signal_size.end(), [](int64_t dim_size) {
return has_large_prime_factor(dim_size);
});
}
#endif

if (use_caching && plan_cache.max_size() > 0) {
guard.lock();
if (plan_cache.max_size() > 0) { // check again after acquiring the lock
config = &plan_cache.lookup(Params);

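The hunk above reworks the Bluestein-specific caching workaround. The plan cache itself is user-visible; a small usage sketch (assumes a CUDA build):

    import torch

    cache = torch.backends.cuda.cufft_plan_cache   # proxies the current device's cache
    print(cache.max_size, cache.size)
    cache.max_size = 0                              # disable caching (the max_size() > 0 branch above)
    torch.fft.fft(torch.randn(97, device="cuda"))   # a size with a large prime factor
    cache.max_size = 4096                           # restore caching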
@ -5,20 +5,12 @@

namespace at::native {

__global__ void weight_int8pack_mm_kernel(
const float* x,
const int8_t* w,
const float* scale,
float* out,
int B,
int K,
int N) {
__global__ void weight_int8pack_mm_kernel(const float* x, const int8_t* w, const float* scale, float* out, int B, int K, int N) {
// one thread per output element: [B, N]
int b = blockIdx.y * blockDim.y + threadIdx.y;
int n = blockIdx.x * blockDim.x + threadIdx.x;

if (b >= B || n >= N)
return;
if (b >= B || n >= N) return;

float acc = 0.0f;
for (int k = 0; k < K; ++k) {
@ -28,11 +20,7 @@ __global__ void weight_int8pack_mm_kernel(
out[b * N + n] = acc * scale[n];
}

void launch_weight_int8pack_mm_cuda_kernel(
const Tensor& x,
const Tensor& w_int8,
const Tensor& scale,
Tensor& out) {
void launch_weight_int8pack_mm_cuda_kernel(const Tensor& x, const Tensor& w_int8, const Tensor& scale, Tensor& out) {
const int B = x.size(0);
const int K = x.size(1);
const int N = w_int8.size(0);
@ -47,16 +35,12 @@ void launch_weight_int8pack_mm_cuda_kernel(
w_int8.data_ptr<int8_t>(),
scale.data_ptr<float>(),
out.data_ptr<float>(),
B,
K,
N);
B, K, N);
}

// Main GPU entry point
at::Tensor _weight_int8pack_mm_cuda(
const at::Tensor& x,
const at::Tensor& w_int8,
const at::Tensor& scale) {
at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int8, const at::Tensor& scale) {
// --- Check inputs ---
TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor");
TORCH_CHECK(w_int8.is_cuda(), "w must be a CUDA tensor");
@ -66,16 +50,12 @@ at::Tensor _weight_int8pack_mm_cuda(
TORCH_CHECK(w_int8.dim() == 2, "w must be 2D");
TORCH_CHECK(scale.dim() == 1, "scale must be 1D");

TORCH_CHECK(
x.size(1) == w_int8.size(1),
"K dimension mismatch: x.size(1) != w.size(1)");
TORCH_CHECK(
w_int8.size(0) == scale.size(0),
"Output dim mismatch: w.size(0) != scale.size(0)");
TORCH_CHECK(x.size(1) == w_int8.size(1), "K dimension mismatch: x.size(1) != w.size(1)");
TORCH_CHECK(w_int8.size(0) == scale.size(0), "Output dim mismatch: w.size(0) != scale.size(0)");

// --- Determine shapes ---
auto B = x.size(0); // batch size
auto N = w_int8.size(0); // output dim
auto B = x.size(0); // batch size
auto N = w_int8.size(0); // output dim

// Ensure inputs are in the correct types for the kernel
auto x_f32 = x.to(at::kFloat);
@ -83,13 +63,12 @@ at::Tensor _weight_int8pack_mm_cuda(
auto scale_f32 = scale.to(at::kFloat);

// --- Allocate output ---
auto out = at::empty({B, N}, x_f32.options());
auto out = at::empty({B, N}, x.options().dtype(at::kFloat));

// --- Launch kernel ---
launch_weight_int8pack_mm_cuda_kernel(
x_f32, w_int8_contiguous, scale_f32, out);
launch_weight_int8pack_mm_cuda_kernel(x_f32, w_int8_contiguous, scale_f32, out);

return out.to(x.dtype());
return out;
}

} // namespace at::native

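The kernel above backs the internal int8 weight-only matmul. A hedged usage sketch through the private ATen op (torch.ops.aten._weight_int8pack_mm is not a stable public API, so the call and its dtype expectations are an assumption about current builds):

    import torch

    B, K, N = 4, 64, 32
    x = torch.randn(B, K, device="cuda", dtype=torch.bfloat16)
    w = torch.randint(-128, 127, (N, K), device="cuda", dtype=torch.int8)
    scale = torch.rand(N, device="cuda", dtype=torch.bfloat16)

    # out[b, n] = sum_k x[b, k] * w[n, k] * scale[n]
    out = torch.ops.aten._weight_int8pack_mm(x, w, scale)
    print(out.shape)   # torch.Size([4, 32])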
@ -482,9 +482,7 @@ auto build_graph(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA")
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
@ -704,9 +702,7 @@ auto build_graph_nestedtensor(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA_NESTEDTENSOR")
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale)
.set_seq_len_q(SEQ_LEN_Q_)

@ -2,7 +2,6 @@
#include <ATen/core/Tensor.h>
#include <ATen/TensorUtils.h>
#include <ATen/div_rtn.h>
#include <c10/util/safe_numerics.h>

namespace at::native {

@ -55,14 +54,6 @@ inline void col2im_shape_check(

int64_t batch_dim = (ndim == 3) ? 0 : -1;
int64_t n_input_plane = input.size(batch_dim + 1);
uint64_t prod_kernel_size = 1;

TORCH_CHECK(!c10::mul_overflows(static_cast<uint64_t>(kernel_width), static_cast<uint64_t>(kernel_height), &prod_kernel_size),
"Given kernel_width = ",
kernel_width,
" and kernel_height = ",
kernel_height,
" the product of kernel_width and kernel_height overflowed.");

if (n_input_plane % (kernel_width * kernel_height) != 0) {
TORCH_CHECK(false,

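The col2im change above guards the kernel_width * kernel_height product against overflow before the divisibility check. The same divisibility constraint is visible from Python through F.fold, e.g.:

    import torch
    import torch.nn.functional as F

    kH, kW = 3, 3
    cols = torch.randn(1, 5 * kH * kW, 16)   # channel dim must be divisible by kH * kW
    out = F.fold(cols, output_size=(6, 6), kernel_size=(kH, kW))
    print(out.shape)   # torch.Size([1, 5, 6, 6])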
@ -559,60 +559,4 @@ Tensor _int_mm_xpu(const Tensor& self, const Tensor& mat2) {
at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt));
return _int_mm_out_xpu(self, mat2, result);
}

Tensor _weight_int8pack_mm_xpu(
const Tensor& A,
const Tensor& B,
const Tensor& scales) {
auto M = A.size(0);
auto N = B.size(0);
auto K = A.size(1);

TORCH_CHECK(
A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat,
" : expect A to be either 32-bit or 16-bit float tensor.");
TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor.");
TORCH_CHECK(
A.stride(1) == 1, " : A must be contiguous on the last dimension.");
TORCH_CHECK(B.dtype() == kChar, " : expect B to be int8 tensor.");
TORCH_CHECK(B.is_contiguous(), " : expect B to be contiguous.");
TORCH_CHECK(B.size(1) == K, " : expect B.size(1) == ", K);

TORCH_CHECK(
scales.dim() == 1 && scales.size(0) == N,
" : expect scales to be 1d tensor with size ",
N);

auto C = at::empty({M, N}, A.options());

// --- Launch kernel ---
Tensor bias = at::Tensor();
Tensor mat2_zero_points = at::Tensor();
Tensor non_const_scales = scales;
auto post_op_args = torch::List<std::optional<at::Scalar>>();

at::native::onednn::quantized_matmul(
A.contiguous(),
1.0,
0,
B,
non_const_scales,
mat2_zero_points,
bias,
C,
1.0,
0,
C.scalar_type(),
/*other*/ std::nullopt,
/*other scale*/ 1.0,
/*other zp*/ 0,
/*binary post op*/ "none",
/*binary alpha*/ 1.0,
/*post_op_name*/ "none",
post_op_args,
/*post_op_algorithm*/ "none",
/*m2_trans*/ false);

return C;
}
} // namespace at::native

@ -110,9 +110,8 @@ void quantized_matmul(
// [Note] Quantized Matrix Multiplication at XPU
// The following code integrates oneDNN quantized gemm. The quantization
// config we support:
// activation: s8, u8, fp16, bf16, fp32; per tensor calibrated;
// symmetric&asymmetric weight: s8; per_tensor/per_channel calibrated;
// symmetric
// activation: s8&u8; per tensor calibrated; symmetric&asymmetric
// weight: s8; per_tensor/per_channel calibrated; symmetric
auto attr = Attr(static_cast<float>(1.0 / output_scale), output_zero_point);
construct_attr_by_post_op(
binary_post_op,

@ -1,25 +0,0 @@
#pragma once
#include <c10/metal/common.h>

#ifdef __METAL__
enum class EmbeddingBagMode { SUM = 0, MEAN, MAX };
#else
#include <ATen/native/EmbeddingBag.h>
using at::native::EmbeddingBagMode;
#endif

template <typename idx_type_t = uint32_t>
struct EmbeddingBagParams {
::c10::metal::array<idx_type_t, 2> weight_strides;
::c10::metal::array<idx_type_t, 2> output_strides;
::c10::metal::array<idx_type_t, 2> max_indices_strides;

idx_type_t per_sample_weights_stride;

idx_type_t num_indices;
idx_type_t num_bags;
idx_type_t feature_size;

EmbeddingBagMode mode;
int64_t padding_idx;
};
@ -1,261 +0,0 @@
|
||||
#include <ATen/native/mps/kernels/EmbeddingBag.h>
|
||||
#include <c10/metal/utils.h>
|
||||
#include <metal_array>
|
||||
#include <metal_stdlib>
|
||||
|
||||
using namespace metal;
|
||||
using namespace c10::metal;
|
||||
|
||||
template <EmbeddingBagMode M, typename T>
|
||||
struct ReductionOpInit {
|
||||
inline opmath_t<T> operator()() {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct ReductionOpInit<EmbeddingBagMode::MAX, T> {
|
||||
inline opmath_t<T> operator()() {
|
||||
return static_cast<opmath_t<T>>(-INFINITY);
|
||||
}
|
||||
};
|
||||
|
||||
template <EmbeddingBagMode M, typename T>
|
||||
struct ReductionOp {
|
||||
inline opmath_t<T> operator()(
|
||||
opmath_t<T> weight_val,
|
||||
opmath_t<T> out_val,
|
||||
bool is_first) {
|
||||
return weight_val + out_val;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct ReductionOp<EmbeddingBagMode::MAX, T> {
|
||||
inline opmath_t<T> operator()(
|
||||
opmath_t<T> weight_val,
|
||||
opmath_t<T> out_val,
|
||||
bool is_first) {
|
||||
return (is_first || weight_val > out_val) ? weight_val : out_val;
|
||||
}
|
||||
};
|
||||
|
||||
template <EmbeddingBagMode M, typename T>
|
||||
struct MaybeApplyPerSampleWeight {
|
||||
inline opmath_t<T> operator()(
|
||||
opmath_t<T> weight_val,
|
||||
uint32_t per_sample_weights_index,
|
||||
constant T* per_sample_weights,
|
||||
uint32_t per_sample_weights_stride) {
|
||||
return weight_val;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct MaybeApplyPerSampleWeight<EmbeddingBagMode::SUM, T> {
|
||||
inline opmath_t<T> operator()(
|
||||
opmath_t<T> weight_val,
|
||||
uint32_t per_sample_weights_index,
|
||||
constant T* per_sample_weights,
|
||||
uint32_t per_sample_weights_stride) {
|
||||
if (per_sample_weights_stride) {
|
||||
T per_sample_weight = per_sample_weights
|
||||
[per_sample_weights_stride * per_sample_weights_index];
|
||||
return static_cast<opmath_t<T>>(per_sample_weight) * weight_val;
|
||||
} else {
|
||||
return weight_val;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <EmbeddingBagMode M, typename T, typename I>
|
||||
struct MaybeCalcMaxIndex {
|
||||
inline void operator()(
|
||||
opmath_t<T> weight_val,
|
||||
opmath_t<T> out_val,
|
||||
bool is_first,
|
||||
thread I& max_idx,
|
||||
I weight_idx,
|
||||
bool pad) {}
|
||||
};
|
||||
|
||||
template <typename T, typename I>
|
||||
struct MaybeCalcMaxIndex<EmbeddingBagMode::MAX, T, I> {
|
||||
inline void operator()(
|
||||
opmath_t<T> weight_val,
|
||||
opmath_t<T> out_val,
|
||||
bool is_first,
|
||||
thread I& max_idx,
|
||||
I weight_idx,
|
||||
bool pad) {
|
||||
max_idx = !pad && (is_first || weight_val > out_val) ? weight_idx : max_idx;
|
||||
}
|
||||
};
|
||||
|
||||
template <EmbeddingBagMode M, typename T>
|
||||
struct ReductionOpFinal {
|
||||
inline T operator()(opmath_t<T> val, uint32_t) {
|
||||
return static_cast<T>(val);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct ReductionOpFinal<EmbeddingBagMode::MEAN, T> {
|
||||
inline T operator()(opmath_t<T> val, uint32_t count) {
|
||||
auto out = val / count;
|
||||
return static_cast<T>((count == 0) ? 0 : out);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct ReductionOpFinal<EmbeddingBagMode::MAX, T> {
|
||||
inline T operator()(opmath_t<T> val, uint32_t count) {
|
||||
return static_cast<T>((count == 0) ? 0 : val);
|
||||
}
|
||||
};
|
||||
|
||||
template <EmbeddingBagMode M, typename I>
|
||||
struct MaybeWriteMaxIndex {
|
||||
inline void operator()(
|
||||
device I*,
|
||||
const constant ::c10::metal::array<uint32_t, 2>&,
|
||||
uint32_t,
|
||||
uint32_t,
|
||||
I) {}
|
||||
};
|
||||
|
||||
template <typename I>
|
||||
struct MaybeWriteMaxIndex<EmbeddingBagMode::MAX, I> {
|
||||
inline void operator()(
|
||||
device I* max_indices,
|
||||
const constant ::c10::metal::array<uint32_t, 2>& max_indices_strides,
|
||||
uint32_t bag_idx,
|
||||
uint32_t feature_idx,
|
||||
I max_idx) {
|
||||
max_indices
|
||||
[bag_idx * max_indices_strides[0] +
|
||||
feature_idx * max_indices_strides[1]] = max_idx;
|
||||
}
|
||||
};
|
||||
|
||||
template <EmbeddingBagMode M, typename T, typename I>
|
||||
void embedding_bag_impl(
|
||||
constant T* weight,
|
||||
constant I* indices,
|
||||
constant I* offsets,
|
||||
constant T* per_sample_weights,
|
||||
device T* output,
|
||||
device I* offset2bag,
|
||||
device I* bag_size,
|
||||
device I* max_indices,
|
||||
constant EmbeddingBagParams<uint32_t>& params,
|
||||
uint tid) {
|
||||
auto num_indices = params.num_indices;
|
||||
auto num_bags = params.num_bags;
|
||||
auto feature_size = params.feature_size;
|
||||
auto padding_idx = params.padding_idx;
|
||||
auto per_sample_weights_stride = params.per_sample_weights_stride;
|
||||
constant auto& output_strides = params.output_strides;
|
||||
constant auto& weight_strides = params.weight_strides;
|
||||
constant auto& max_indices_strides = params.max_indices_strides;
|
||||
|
||||
auto bag_idx = tid / feature_size;
|
||||
auto feature_idx = tid % feature_size;
|
||||
|
||||
uint32_t offsets_end = min(bag_idx + 1, num_bags - 1);
|
||||
bool is_last_bag = bag_idx + 1 == num_bags;
|
||||
uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]);
|
||||
uint32_t indices_end = is_last_bag * (num_indices) +
|
||||
(!is_last_bag) * (static_cast<uint32_t>(offsets[offsets_end]));
|
||||
|
||||
auto out_val = ReductionOpInit<M, T>()();
|
||||
|
||||
uint32_t bag_size_ = 0;
|
||||
I max_idx = 0;
|
||||
|
||||
for (uint32_t indices_idx = indices_start; indices_idx < indices_end;
|
||||
indices_idx++) {
|
||||
I weight_idx = indices[indices_idx];
|
||||
bool pad = (weight_idx == padding_idx);
|
||||
auto weight_val = static_cast<opmath_t<T>>(
|
||||
weight
|
||||
[static_cast<uint32_t>(weight_idx) * weight_strides[0] +
|
||||
feature_idx * weight_strides[1]]);
|
||||
|
||||
weight_val = MaybeApplyPerSampleWeight<M, T>()(
|
||||
weight_val, indices_idx, per_sample_weights, per_sample_weights_stride);
|
||||
|
||||
auto new_out_val = ReductionOp<M, T>()(weight_val, out_val, bag_size_ == 0);
|
||||
|
||||
MaybeCalcMaxIndex<M, T, I>()(
|
||||
weight_val, out_val, bag_size_ == 0, max_idx, weight_idx, pad);
|
||||
|
||||
out_val = pad ? out_val : new_out_val;
|
||||
offset2bag[indices_idx] = bag_idx;
|
||||
bag_size_ += static_cast<uint32_t>(!pad);
|
||||
}
|
||||
|
||||
output[bag_idx * output_strides[0] + feature_idx * output_strides[1]] =
|
||||
ReductionOpFinal<M, T>()(out_val, bag_size_);
|
||||
|
||||
bag_size[bag_idx] = bag_size_;
|
||||
|
||||
MaybeWriteMaxIndex<M, I>()(
|
||||
max_indices, max_indices_strides, bag_idx, feature_idx, max_idx);
|
||||
}
|
||||
|
||||
#define DISPATCH_IMPL(MODE) \
|
||||
return embedding_bag_impl<MODE>( \
|
||||
weight, \
|
||||
indices, \
|
||||
offsets, \
|
||||
per_sample_weights, \
|
||||
output, \
|
||||
offset2bag, \
|
||||
bag_size, \
|
||||
max_indices, \
|
||||
params, \
|
||||
tid)
|
||||
|
||||
template <typename T, typename I>
|
||||
kernel void embedding_bag(
|
||||
constant T* weight [[buffer(0)]],
|
||||
constant I* indices [[buffer(1)]],
|
||||
constant I* offsets [[buffer(2)]],
|
||||
constant T* per_sample_weights [[buffer(3)]],
|
||||
device T* output [[buffer(4)]],
|
||||
device I* offset2bag [[buffer(5)]],
|
||||
device I* bag_size [[buffer(6)]],
|
||||
device I* max_indices [[buffer(7)]],
|
||||
constant EmbeddingBagParams<uint32_t>& params [[buffer(8)]],
|
||||
uint tid [[thread_position_in_grid]]) {
|
||||
switch (params.mode) {
|
||||
case EmbeddingBagMode::SUM:
|
||||
DISPATCH_IMPL(EmbeddingBagMode::SUM);
|
||||
case EmbeddingBagMode::MEAN:
|
||||
DISPATCH_IMPL(EmbeddingBagMode::MEAN);
|
||||
case EmbeddingBagMode::MAX:
|
||||
DISPATCH_IMPL(EmbeddingBagMode::MAX);
|
||||
}
|
||||
}
|
||||
|
||||
#define REGISTER_EMBEDDING_BAG_OP(T, I) \
|
||||
template [[host_name("embedding_bag_" #T "_" #I)]] \
|
||||
kernel void embedding_bag<T, I>( \
|
||||
constant T * weight [[buffer(0)]], \
|
||||
constant I * indices [[buffer(1)]], \
|
||||
constant I * offsets [[buffer(2)]], \
|
||||
constant T * per_sample_weights [[buffer(3)]], \
|
||||
device T * output [[buffer(4)]], \
|
||||
device I * offset2bag [[buffer(5)]], \
|
||||
device I * bag_size [[buffer(6)]], \
|
||||
device I * max_indices [[buffer(7)]], \
|
||||
constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \
|
||||
uint tid [[thread_position_in_grid]]);
|
||||
|
||||
REGISTER_EMBEDDING_BAG_OP(float, int);
|
||||
REGISTER_EMBEDDING_BAG_OP(float, long);
|
||||
REGISTER_EMBEDDING_BAG_OP(half, int);
|
||||
REGISTER_EMBEDDING_BAG_OP(half, long);
|
||||
REGISTER_EMBEDDING_BAG_OP(bfloat, int);
|
||||
REGISTER_EMBEDDING_BAG_OP(bfloat, long);
|
||||
@ -198,7 +198,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,

if (input_t.is_contiguous(memory_format) && output_t.is_contiguous(memory_format) && is_macOS_15_0_or_newer) {
inputNDArray = getMPSNDArray(input_t, inputShape);
outputNDArray = getMPSNDArray(output_t, outputShape);
outputNDArray = getMPSNDArray(*output, outputShape);
}

auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
@ -302,7 +302,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
}
}
auto outputPlaceholder = outputNDArray ? Placeholder(cachedGraph->outputTensor_, outputNDArray)
: Placeholder(cachedGraph->outputTensor_, output_t);
: Placeholder(cachedGraph->outputTensor_, *output);

NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =
[[[NSMutableDictionary alloc] initWithCapacity:3] autorelease];
@ -315,7 +315,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
}

return output_t;
return *output;
}

Tensor _mps_convolution(const Tensor& input_t,

@ -1,180 +0,0 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/TensorUtils.h>
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/mps/MPSProfiler.h>
|
||||
#include <ATen/native/EmbeddingBag.h>
|
||||
#include <ATen/native/Pool.h>
|
||||
#include <ATen/native/mps/OperationUtils.h>
|
||||
#include <ATen/native/mps/kernels/EmbeddingBag.h>
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/_embedding_bag_forward_only_native.h>
|
||||
#include <ATen/ops/_embedding_bag_native.h>
|
||||
#include <ATen/ops/empty.h>
|
||||
#endif
|
||||
|
||||
namespace at::native {
|
||||
|
||||
#ifndef PYTORCH_JIT_COMPILE_SHADERS
|
||||
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
|
||||
#else
|
||||
#include <ATen/native/mps/EmbeddingBag_metallib.h>
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
|
||||
std::pair<Tensor, Tensor> promoteIndicesAndOffsets(const Tensor& indices, const Tensor& offsets) {
|
||||
const auto commonType = promoteTypes(offsets.scalar_type(), indices.scalar_type());
|
||||
return {indices.scalar_type() == commonType ? indices : indices.toType(commonType),
|
||||
offsets.scalar_type() == commonType ? offsets : offsets.toType(commonType)};
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace mps {
|
||||
|
||||
static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
|
||||
const Tensor& weight,
|
||||
const Tensor& indices_,
|
||||
const Tensor& offsets_,
|
||||
const bool scale_grad_by_freq,
|
||||
const int64_t mode,
|
||||
bool sparse,
|
||||
const std::optional<Tensor>& per_sample_weights_opt,
|
||||
bool include_last_offset,
|
||||
int64_t padding_idx) {
|
||||
TORCH_CHECK(indices_.dim() == 1, "input has to be a 1D Tensor, but got Tensor of dimension ", indices_.dim());
if (indices_.dim() == 1) {
TORCH_CHECK(offsets_.dim() == 1, "offsets has to be a 1D Tensor, but got Tensor of dimension ", offsets_.dim());
}
TORCH_CHECK(weight.dim() == 2, "weight has to be a 2D Tensor, but got Tensor of dimension ", weight.dim());

Tensor indices, offsets;
std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_);
auto indices_arg = TensorArg(indices, "indices", 1);
checkScalarTypes("embedding_bag_mps", indices_arg, {kLong, kInt});
auto offsets_arg = TensorArg(offsets, "offsets", 1);
checkScalarTypes("embedding_bag_mps", offsets_arg, {kLong, kInt});
checkSameType("embedding_bag_mps", indices_arg, offsets_arg);
auto weight_arg = TensorArg(weight, "weight", 1);

int64_t num_indices = indices.size(0);
int64_t num_bags = offsets.size(0);
if (include_last_offset) {
TORCH_CHECK(num_bags >= 1, "include_last_offset: number of offsets should be at least 1");
num_bags -= 1;
}
int64_t feature_size = weight.size(1);

auto bag_size = at::empty({num_bags}, indices.options());
auto offset2bag = at::empty({indices.size(0)}, indices.options());
auto output = at::empty({num_bags, feature_size}, weight.options());

Tensor max_indices;

if (mode == EmbeddingBagMode::MAX) {
max_indices = at::empty({num_bags, feature_size}, indices.options());
} else {
max_indices = at::empty({0}, indices.options());
}

EmbeddingBagParams<uint32_t> params;

for (const auto dim : c10::irange(weight.dim())) {
params.weight_strides[dim] = safe_downcast<uint32_t, int64_t>(weight.stride(dim));
params.output_strides[dim] = safe_downcast<uint32_t, int64_t>(output.stride(dim));

if (mode == EmbeddingBagMode::MAX) {
params.max_indices_strides[dim] = safe_downcast<uint32_t, int64_t>(max_indices.stride(dim));
}
}

bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;

params.num_indices = num_indices;
params.num_bags = num_bags;
params.feature_size = feature_size;
params.mode = static_cast<EmbeddingBagMode>(mode);
params.padding_idx = padding_idx;

auto num_threads = output.numel();
MPSStream* stream = getCurrentMPSStream();

dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
auto pipeline_state = lib.getPipelineStateForFunc(
fmt::format("embedding_bag_{}_{}", scalarToMetalTypeString(weight), scalarToMetalTypeString(indices)));

getMPSProfiler().beginProfileKernel(pipeline_state, "embedding_bag", {weight, indices, offsets});
[computeEncoder setComputePipelineState:pipeline_state];
mtl_setArgs(computeEncoder,
weight,
indices,
offsets,
use_per_sample_weights ? per_sample_weights_opt : std::nullopt,
output,
offset2bag,
bag_size,
max_indices,
params);

mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
getMPSProfiler().endProfileKernel(pipeline_state);
}
});

return std::tuple<Tensor, Tensor, Tensor, Tensor>(
std::move(output), std::move(offset2bag), std::move(bag_size), std::move(max_indices));
}

} // namespace mps

std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps(const Tensor& weight,
const Tensor& indices,
const Tensor& offsets,
const bool scale_grad_by_freq,
const int64_t mode,
bool sparse,
const std::optional<Tensor>& per_sample_weights_opt,
bool include_last_offset,
int64_t padding_idx) {
return mps::_embedding_bag_mps_impl(weight,
indices,
offsets,
scale_grad_by_freq,
mode,
sparse,
per_sample_weights_opt,
include_last_offset,
padding_idx);
}

std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_forward_only_mps(
const Tensor& weight,
const Tensor& indices,
const Tensor& offsets,
const bool scale_grad_by_freq,
const int64_t mode,
bool sparse,
const std::optional<Tensor>& per_sample_weights_opt,
bool include_last_offset,
int64_t padding_idx) {
return _embedding_bag_mps(weight,
indices,
offsets,
scale_grad_by_freq,
mode,
sparse,
per_sample_weights_opt,
include_last_offset,
padding_idx);
}

} // namespace at::native
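A minimal Python usage sketch of the operator the code above implements; it assumes a macOS build where the MPS backend is available, and falls back to CPU otherwise.

import torch
import torch.nn as nn

device = "mps" if torch.backends.mps.is_available() else "cpu"
bag = nn.EmbeddingBag(num_embeddings=10, embedding_dim=4, mode="max").to(device)
indices = torch.tensor([1, 2, 4, 5, 4, 3], device=device)
offsets = torch.tensor([0, 3], device=device)   # two bags: indices [0:3] and [3:6]
out = bag(indices, offsets)                     # dispatches to _embedding_bag_mps
print(out.shape)                                # torch.Size([2, 4])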
@ -20,7 +20,6 @@
#include <ATen/ops/baddbmm_native.h>
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/cholesky_native.h>
#include <ATen/ops/eye_native.h>
#include <ATen/ops/linalg_cholesky_ex_native.h>
#include <ATen/ops/linalg_inv_ex_native.h>
#include <ATen/ops/linalg_lu_factor_ex_native.h>
@ -497,24 +496,26 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const
using namespace mps;
TORCH_CHECK(result.is_mps(), "Output tensor is not MPS");
TORCH_CHECK(!A.is_complex(), "linalg_inv: not supported for complex types yet!");
using CachedGraph = MPSUnaryCachedGraph;

MPSStream* stream = getCurrentMPSStream();
info.zero_();

if (A.numel() == 0) {
return;
}

if (!result.is_contiguous()) {
result.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
}
auto A_sizes = A.sizes();
int ndim = A.dim();

Tensor LU = empty_like(A, MemoryFormat::Contiguous);
Tensor identity = eye(A.size(-2), A.size(-1), A.scalar_type(), A.options().layout(), A.device()).expand_as(A);
Tensor LU = empty_like(A);
Tensor identity = zeros_like(A);
Tensor pivots = empty({A_sizes.begin(), A_sizes.end() - 1}, A.options().dtype(kInt));
// need to do this to keep the strides of the result tensor
// mps's solve expects row major layout, while inductor
// expects result to be column major
Tensor tmp = empty_like(A, MemoryFormat::Contiguous);
linalg_solve_out_mps_impl(A, identity, true, check_errors, tmp, LU, pivots, info);
result.copy_(tmp);
(ndim == 2 ? identity.diagonal() : identity.diagonal(0, -2, -1)).fill_(1);
linalg_solve_out_mps_impl(A, identity, true, check_errors, result, LU, pivots, info);
}
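The identity behind both variants above is that the inverse is obtained by solving A X = I. A small Python sketch of that equivalence, assuming A is square and reasonably well conditioned:

import torch

A = torch.randn(4, 4, dtype=torch.float64)
I = torch.eye(4, dtype=torch.float64)
X = torch.linalg.solve(A, I)                     # inverse via a solve against the identity
assert torch.allclose(X, torch.linalg.inv(A), atol=1e-10)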
static Tensor& mm_out_mps_impl(const Tensor& self, const Tensor& other, Tensor& output) {

@ -519,13 +519,6 @@ static void max_unpool_out_mps_template(const Tensor& input,
Tensor& output,
const int32_t pooling_dims,
const std::string& op_name) {
TORCH_CHECK(output_size_.size() == static_cast<size_t>(pooling_dims),
op_name,
"There should be exactly ",
pooling_dims,
" elements but got ",
output_size_.size());

auto dims = input.dim();
auto leading_dims = input.dim() - pooling_dims;

@ -541,18 +534,6 @@ static void max_unpool_out_mps_template(const Tensor& input,
output.resize_(output_size, memory_format);
output.fill_(0);

if (indices.defined() && indices.numel() > 0) {
auto output_image_size = c10::multiply_integers(output_size_);

int64_t min_idx = indices.min().item<int64_t>();
int64_t max_idx = indices.max().item<int64_t>();

if (min_idx < 0 || max_idx >= output_image_size) {
int64_t error_idx = (min_idx < 0) ? min_idx : max_idx;
TORCH_CHECK(false, "Found an invalid max index: ", error_idx, " for output tensor of shape ", output_size_);
}
}

id<MTLDevice> device = MPSDevice::getInstance()->device();
MPSStream* mpsStream = getCurrentMPSStream();
const auto numThreads = input.numel();
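A rough Python illustration of the unpooling path touched above: max_unpool2d scatters values to the positions recorded by max_pool2d, so the indices must fall inside the requested output size (the check being removed guarded exactly that). A sketch, not part of the diff:

import torch
import torch.nn.functional as F

x = torch.randn(1, 1, 4, 4)
pooled, indices = F.max_pool2d(x, kernel_size=2, return_indices=True)
restored = F.max_unpool2d(pooled, indices, kernel_size=2)   # valid indices scatter cleanly
print(restored.shape)                                       # torch.Size([1, 1, 4, 4])
# An out-of-range index would previously fail with "Found an invalid max index".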
@ -2351,7 +2351,6 @@
dispatch:
CPU: _embedding_bag_forward_only_cpu
CUDA: _embedding_bag_forward_only_cuda
MPS: _embedding_bag_forward_only_mps
autogen: _embedding_bag_forward_only.out

- func: _rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor)
@ -2373,7 +2372,6 @@
dispatch:
CPU: _embedding_bag_cpu
CUDA: _embedding_bag_cuda
MPS: _embedding_bag_mps
autogen: _embedding_bag.out
tags: core

@ -3858,7 +3856,7 @@
device_check: NoCheck # TensorIterator
structured: True
dispatch:
CPU, CUDA, MTIA: aminmax_out
CPU, CUDA: aminmax_out
MPS: aminmax_out_mps
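For reference, a short sketch of the operator whose dispatch entry changes above; torch.aminmax returns the minimum and maximum in a single pass:

import torch

x = torch.randn(3, 5)
mn, mx = torch.aminmax(x, dim=1)   # both results have shape (3,)
print(mn.shape, mx.shape)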
- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
@ -3909,7 +3907,7 @@
- func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA, MTIA: amax_out
CPU, CUDA: amax_out
MPS: amax_out_mps

# Return: (Tensor output, Tensor indices)
@ -4090,7 +4088,7 @@
- func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA, MTIA: amin_out
CPU, CUDA: amin_out
MPS: amin_out_mps

# TODO: Add this function to MPS dispatch key so that we avoid declaring it in
@ -4243,7 +4241,6 @@
CPU: _weight_int8pack_mm_cpu
CUDA: _weight_int8pack_mm_cuda
MPS: _weight_int8pack_mm_mps
XPU: _weight_int8pack_mm_xpu

- func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
python_module: sparse
@ -4375,7 +4372,7 @@
variants: function, method
dispatch:
CPU: narrow_copy_dense_cpu
SparseCPU, SparseCUDA, SparseMPS: narrow_copy_sparse
SparseCPU, SparseCUDA: narrow_copy_sparse
CompositeExplicitAutogradNonFunctional: narrow_copy_dense_symint
tags: view_copy

@ -6663,7 +6660,7 @@
- func: zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeExplicitAutograd: zeros_out
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: zeros_sparse_out
SparseCPU, SparseCUDA, SparseMeta: zeros_sparse_out

- func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
dispatch:
@ -10702,7 +10699,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow
CUDA: foreach_tensor_div_list_kernel_cuda
MTIA: foreach_tensor_div_list_kernel_mtia

- func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@ -10710,7 +10706,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_
CUDA: foreach_tensor_div_list_kernel_cuda_
MTIA: foreach_tensor_div_list_kernel_mtia_
autogen: _foreach_div.List_out

- func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
@ -10734,7 +10729,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow
CUDA: foreach_tensor_div_tensor_kernel_cuda
MTIA: foreach_tensor_div_tensor_kernel_mtia

- func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@ -10742,7 +10736,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_
CUDA: foreach_tensor_div_tensor_kernel_cuda_
MTIA: foreach_tensor_div_tensor_kernel_mtia_
autogen: _foreach_div.Tensor_out

- func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
@ -10849,7 +10842,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
MTIA: foreach_tensor_maximum_scalar_kernel_mtia_
autogen: _foreach_maximum.Scalar_out

# foreach_minimum/maximum dispatches to clamp_max/min
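A hedged sketch of the _foreach_div entries listed above: the (private) torch._foreach_div_ API divides each tensor in a list in place, which is what the per-backend kernels named in the dispatch table implement.

import torch

xs = [torch.full((3,), 6.0), torch.full((2,), 8.0)]
ys = [torch.full((3,), 2.0), torch.full((2,), 4.0)]
torch._foreach_div_(xs, ys)            # in place: xs becomes [[3,3,3], [2,2]]
print([t.tolist() for t in xs])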
@ -1,6 +1,5 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/ceil_div.h>
#include <ATen/native/cuda/Loops.cuh>
#include <c10/cuda/CUDAGuard.h>
@ -22,11 +21,10 @@
namespace at::native {

namespace {
template <typename T>
__global__ void ChooseQuantizationParamsKernelImpl(
const int64_t* fake_quant_on,
const T* x_min,
const T* x_max,
const float* x_min,
const float* x_max,
int32_t qmin,
int32_t qmax,
int size,
@ -95,44 +93,34 @@ __global__ void ChooseQuantizationParamsKernelImpl(
}
}

__device__ inline bool isinf_device(float v) {
return ::isinf(v);
}
__device__ inline bool isinf_device(c10::BFloat16 v) {
return ::isinf(static_cast<float>(v));
}

// CUDA kernel to compute Moving Average Min/Max of the tensor.
// It uses the running_min and running_max along with averaging const, c.
// The formula used to compute the new min/max is as follows
//
// running_min = (1 - c) * running_min + c * x_min, if running_min != inf
// running_min = x_min, if running_min == inf
template <typename T>
__global__ void MovingAverageMinMax(
const int64_t* observer_on,
const T* x_min,
const T* x_max,
T* running_min,
T* running_max,
const float* x_min,
const float* x_max,
float* running_min,
float* running_max,
const float averaging_const,
const int size) {
int i = blockIdx.x * blockDim.x + threadIdx.x;

if (*observer_on == 1) {
if (i < size) {
T curr_min = x_min[i];
T curr_max = x_max[i];
float curr_min = x_min[i];
float curr_max = x_max[i];

T averaging_const_t = static_cast<T>(averaging_const);
float adjusted_min = ::isinf(running_min[i])
? curr_min
: (running_min[i]) + averaging_const * (curr_min - (running_min[i]));

T adjusted_min = isinf_device(running_min[i]) ? curr_min
: (running_min[i]) +
averaging_const_t * (curr_min - (running_min[i]));

T adjusted_max = isinf_device(running_max[i]) ? curr_max
: (running_max[i]) +
averaging_const_t * (curr_max - (running_max[i]));
float adjusted_max = ::isinf(running_max[i])
? curr_max
: (running_max[i]) + averaging_const * (curr_max - (running_max[i]));

running_min[i] = adjusted_min;
running_max[i] = adjusted_max;
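A plain-Python restatement of the update rule quoted in the kernel comment above (a sketch, not the kernel itself): with averaging constant c, the running extrema decay toward the per-batch min/max unless they are still at their infinite initialization.

import math

def update_running(running, current, c):
    if math.isinf(running):
        return current                          # first observation adopts the batch value
    return running + c * (current - running)    # == (1 - c) * running + c * current

print(update_running(float("inf"), 0.25, 0.01))  # -> 0.25
print(update_running(0.25, 0.50, 0.01))          # moves 1% of the gap: 0.2525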
@ -154,51 +142,40 @@ void _calculate_moving_average(
at::Tensor x_min, x_max;

int64_t* observer_on_data = observer_on.data_ptr<int64_t>();
float* running_min_data = running_min.data_ptr<float>();
float* running_max_data = running_max.data_ptr<float>();
cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();

if (per_row_fq) {
std::tie(x_min, x_max) = at::aminmax(x, 1);
float* x_min_data = x_min.data_ptr<float>();
float* x_max_data = x_max.data_ptr<float>();
int num_threads = std::min(size, (int64_t)512);
const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
scalar_t* x_max_data = x_max.data_ptr<scalar_t>();

scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();

// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<num_blocks, num_threads, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
size);
});
// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<num_blocks, num_threads, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
size);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
std::tie(x_min, x_max) = at::aminmax(x);
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
scalar_t* x_max_data = x_max.data_ptr<scalar_t>();

scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();

// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<1, 1, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
1 /*size*/);
});
float* x_min_data = x_min.data_ptr<float>();
float* x_max_data = x_max.data_ptr<float>();
// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<1, 1, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
1 /*size*/);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
@ -221,44 +198,34 @@ void _calc_moving_avg_qparams_helper(
cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();
int64_t* fake_quant_on_data = fake_quant_on.data_ptr<int64_t>();
if (per_row_fq) {
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
int num_threads = std::min(size, (int64_t)512);
const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
ChooseQuantizationParamsKernelImpl<<<
num_blocks,
num_threads,
0,
cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
size,
symmetric_quant,
scale_ptr,
zp_ptr);
});
float* running_min_data = running_min.data_ptr<float>();
float* running_max_data = running_max.data_ptr<float>();
int num_threads = std::min(size, (int64_t)512);
const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
ChooseQuantizationParamsKernelImpl<<<num_blocks, num_threads, 0, cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
size,
symmetric_quant,
scale_ptr,
zp_ptr);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
1, // size
symmetric_quant, // preserve_sparsity
scale_ptr,
zp_ptr);
});
float* running_min_data = running_min.data_ptr<float>();
float* running_max_data = running_max.data_ptr<float>();
ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
1, // size
symmetric_quant, // preserve_sparsity
scale_ptr,
zp_ptr);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
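For orientation, a hedged Python sketch of what a qparams kernel like the one above produces in the asymmetric case: a scale and zero point mapping [running_min, running_max] onto [qmin, qmax]. This mirrors the textbook formula, not the exact CUDA implementation.

def choose_qparams(running_min, running_max, qmin=0, qmax=255):
    # zero must be representable, as fake-quant observers require
    running_min = min(running_min, 0.0)
    running_max = max(running_max, 0.0)
    scale = (running_max - running_min) / (qmax - qmin)
    if scale == 0.0:
        scale = 1.0
    zero_point = int(round(qmin - running_min / scale))
    return scale, max(qmin, min(qmax, zero_point))

print(choose_qparams(-1.0, 3.0))   # roughly (0.0157, 64)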
@ -64,6 +64,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
// create sparse descriptor, dtype
cusparseLtMatDescriptor_t sparse_input_descriptor;
cudaDataType type;
auto compression_factor = 9;

#ifdef USE_ROCM
TORCH_CHECK(isHipSparseLtSupported());
@ -72,6 +73,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
switch (sparse_input.scalar_type()) {
case at::ScalarType::Char:
type = CUDA_R_8I;
compression_factor = 10;
break;
case at::ScalarType::Half:
type = CUDA_R_16F;
@ -87,6 +89,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
#if defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602 && !defined(USE_ROCM)
case at::ScalarType::Float8_e4m3fn:
type = CUDA_R_8F_E4M3;
compression_factor = 10;
break;
#endif
default:
@ -94,6 +97,10 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
break;
}

// create a new compressed tensor with the same dtype as
auto compressed_tensor =
sparse_input.new_empty(sparse_input.numel() * compression_factor / 16);

TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit(
&handle,
&sparse_input_descriptor,
@ -114,15 +121,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
&compressed_size,
&compressed_buffer_size));

// create a new compressed tensor with the same dtype as the input,
// and with packed data/metadata stored in an array with original
// number of rows, and sufficient columns to provide compressed_size
// buffer (in bytes)
size_t orig_m = sparse_input.size(0);
size_t div = orig_m * sparse_input.itemsize();
size_t new_n = (compressed_size + div - 1) / div; // floor
auto compressed_tensor = sparse_input.new_empty({(int64_t)orig_m, (int64_t)new_n});

auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
auto compressedBufferPtr = allocator.allocate(compressed_buffer_size);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -167,6 +165,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
cudaDataType output_type;
cudaDataType C_type;
cusparseComputeType compute_type;
auto compression_factor = 9;

#ifdef USE_ROCM
TORCH_CHECK(isHipSparseLtSupported());
@ -178,6 +177,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
output_type = CUDA_R_8I;
C_type = CUDA_R_8I;
compute_type = CUSPARSE_COMPUTE_32I;
compression_factor = 10;
break;

// cuSPARSELt v0.5.2 onwards changes CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUT_16F
@ -210,6 +210,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
output_type = CUDA_R_8F_E4M3;
C_type = CUDA_R_16F;
compute_type = CUSPARSE_COMPUTE_32F;
compression_factor = 10;
break;
#endif
// cuSPARSELt <= v0.5.2 uses CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUTE_16F
@ -299,10 +300,9 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
}
}

TORCH_INTERNAL_ASSERT(compressed_A.dim() == 2); // encoded M x S
int64_t k = dense_B.size(0);
int64_t n = dense_B.size(1);
int64_t m = compressed_A.size(0);
int64_t m = (compressed_A.numel() * 16 / compression_factor) / k;

// initialize sparse descriptor
cusparseLtMatDescriptor_t sparse_input_descriptor;
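A worked sketch of the sizing logic in one of the variants above (illustrative numbers only, not taken from the diff): the compressed buffer is reshaped to keep the original row count, with just enough columns, counted in elements of the input dtype, to hold compressed_size bytes.

orig_m = 128                 # rows of the 2:4 sparse input (assumed)
itemsize = 2                 # fp16 bytes per element (assumed)
compressed_size = 147456     # byte count reported by the cuSPARSELt size query (assumed)

div = orig_m * itemsize
new_n = (compressed_size + div - 1) // div   # ceiling division, as in the C++ code
print(orig_m, new_n)                         # compressed tensor shape: (orig_m, new_n)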
@ -5,6 +5,51 @@

#include <ATen/test/allocator_clone_test.h>

#include <torch/csrc/cuda/CUDAPluggableAllocator.h>

TEST(AllocatorTestCUDA, test_clone) {
test_allocator_clone(c10::cuda::CUDACachingAllocator::get());
}

static int called_dummy_free_0 = 0;
static int called_dummy_free_1 = 0;

void* dummy_alloc_0(size_t size, int device, void* stream) {return nullptr;}
void dummy_free_0(void* data, size_t size, int device, void* stream) {
called_dummy_free_0++;
}
void dummy_free_1(void* data, size_t size, int device, void* stream) {
called_dummy_free_1++;
}

// Tests that data_ptrs have their respective deleters
// when mixing allocators
TEST(AllocatorTestCUDA, test_pluggable_allocator_deleters) {
// Create a tensor with dummy_allocator_0, where dummy_free_0 is the deleter
auto dummy_allocator_0 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_0);
c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_0.get());
at::Tensor a = at::empty({0}, at::TensorOptions().device(at::kCUDA));

// Create a tensor with dummy_allocator_1, where dummy_free_1 is the deleter
auto dummy_allocator_1 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_1);
c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_1.get());
at::Tensor b = at::empty({0}, at::TensorOptions().device(at::kCUDA));

// Manually use a's deleter
auto* ctx = a.storage().data_ptr().get_context();
a.storage().data_ptr().get_deleter()(ctx);
a.storage().mutable_data_ptr().release_context();

// a's deleter is dummy_free_0
// dummy_free_0 should be called above, so called_dummy_free_0 should be 1
ASSERT_TRUE(called_dummy_free_0 == 1);

// Manually use b's deleter
ctx = b.storage().data_ptr().get_context();
b.storage().data_ptr().get_deleter()(ctx);
b.storage().mutable_data_ptr().release_context();

// b's deleter is dummy_free_1
// dummy_free_1 should be called above, so called_dummy_free_1 should be 1
ASSERT_TRUE(called_dummy_free_1 == 1);
}
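The C++ test above swaps allocators via CUDAPluggableAllocator; the Python-side counterpart looks roughly like this sketch. The shared-library path and exported symbol names are placeholders, and the allocator must be installed before any CUDA allocation happens.

import torch

alloc = torch.cuda.memory.CUDAPluggableAllocator(
    "./my_alloc.so",   # hypothetical library exposing the two symbols below
    "my_malloc",       # assumed signature: void* my_malloc(size_t size, int device, cudaStream_t stream)
    "my_free",         # assumed signature: void  my_free(void* ptr, size_t size, int device, cudaStream_t stream)
)
torch.cuda.memory.change_current_allocator(alloc)
x = torch.empty(16, device="cuda")   # now served by the plugged-in allocator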
@ -42,7 +42,7 @@ TEST(MPSObjCInterfaceTest, MPSCustomKernel) {
id<MTLLibrary> customKernelLibrary = [device newLibraryWithSource: [NSString stringWithUTF8String:CUSTOM_KERNEL]
options: nil
error: &error];
TORCH_CHECK(customKernelLibrary, "Failed to create custom kernel library, error: ", error.localizedDescription.UTF8String);
TORCH_CHECK(customKernelLibrary, "Failed to to create custom kernel library, error: ", error.localizedDescription.UTF8String);

id<MTLFunction> customFunction = [customKernelLibrary newFunctionWithName: @"add_arrays"];
TORCH_CHECK(customFunction, "Failed to create function state object for the kernel");
@ -76,23 +76,4 @@ int32_t getGlobalIdxFromDevice(DeviceIndex device) {
return device_global_idxs[device];
}

// Check if a device can access the memory of a peer device directly.
bool canDeviceAccessPeer(DeviceIndex device, DeviceIndex peer) {
if (device == -1) {
device = c10::xpu::current_device();
}
if (peer == -1) {
peer = c10::xpu::current_device();
}
check_device_index(device);
check_device_index(peer);
// A device can always access itself
if (device == peer) {
return true;
}
return c10::xpu::get_raw_device(device).ext_oneapi_can_access_peer(
c10::xpu::get_raw_device(peer),
sycl::ext::oneapi::peer_access::access_supported);
}

} // namespace at::xpu

@ -17,6 +17,4 @@ TORCH_XPU_API DeviceProp* getDeviceProperties(DeviceIndex device);

TORCH_XPU_API int32_t getGlobalIdxFromDevice(DeviceIndex device);

TORCH_XPU_API bool canDeviceAccessPeer(DeviceIndex device, DeviceIndex peer);

} // namespace at::xpu
@ -174,11 +174,11 @@ YituTechConvBert,pass,0

meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5

google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5

@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0

openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6

Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,5
hf_Reformer,pass,8

@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0

hf_T5_generate,pass,7
hf_T5_generate,pass,11

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,20
hf_Reformer,pass,25

@ -170,15 +170,15 @@ YituTechConvBert,pass,0

meta-llama/Llama-3.2-1B,fail_to_run,0
meta-llama/Llama-3.2-1B,fail_accuracy,0

google/gemma-2-2b,fail_to_run,0
google/gemma-2-2b,fail_accuracy,0

google/gemma-3-4b-it,fail_to_run,0
google/gemma-3-4b-it,fail_accuracy,0

@ -186,4 +186,4 @@ openai/whisper-tiny,fail_to_run,0

Qwen/Qwen3-0.6B,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0

@ -138,7 +138,7 @@ hf_Bert_large,pass,0

hf_BigBird,pass,27
hf_BigBird,pass,25

@ -158,7 +158,7 @@ hf_Longformer,pass,4

hf_Reformer,pass,5
hf_Reformer,pass,8

@ -138,7 +138,7 @@ hf_Bert_large,pass,0

hf_BigBird,pass,27
hf_BigBird,pass,25

@ -158,7 +158,7 @@ hf_Longformer,pass,4

hf_Reformer,pass,5
hf_Reformer,pass,8

@ -138,7 +138,7 @@ hf_Bert_large,pass,0

hf_BigBird,pass,27
hf_BigBird,pass,25

@ -158,7 +158,7 @@ hf_Longformer,pass,4

hf_Reformer,pass,5
hf_Reformer,pass,8

@ -174,11 +174,11 @@ YituTechConvBert,pass,0

meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5

google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5

@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0

openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6

Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,5
hf_Reformer,pass,8

@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0

hf_T5_generate,pass,7
hf_T5_generate,pass,11

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,20
hf_Reformer,pass,25

@ -122,7 +122,7 @@ hf_Bert_large,pass,0

hf_BigBird,pass,27
hf_BigBird,pass,25

@ -142,7 +142,7 @@ hf_Longformer,pass,4

hf_Reformer,pass,5
hf_Reformer,pass,8

@ -174,11 +174,11 @@ YituTechConvBert,pass,0

meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5

google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5

@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass,0

openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6

Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,5
hf_Reformer,pass,8

@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0

hf_T5_generate,pass,7
hf_T5_generate,pass,11

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0

hf_Reformer,pass,20
hf_Reformer,pass,25

@ -174,11 +174,11 @@ YituTechConvBert,pass,0

meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5

google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5

@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0

openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6

Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5
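The third column in the expected-result CSVs above appears to track the graph-break count each benchmark is allowed under torch.compile. A quick way to inspect breaks locally is the (private) torch._dynamo.explain helper, sketched here with a deliberately break-inducing function:

import torch

def f(x):
    x = x.sin()
    print("side effect that splits the graph")   # data-dependent side effect causes a break
    return x.cos()

explanation = torch._dynamo.explain(f)(torch.randn(4))
print(explanation.graph_break_count)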
Some files were not shown because too many files have changed in this diff.