Compare commits

..

7 Commits

SHA1 Message Date
a3795cfaea lint 2025-09-16 11:00:20 -07:00
a9d5c00727 more memory? 2025-09-15 21:42:41 +00:00
c1102ca308 more memory? 2025-09-15 21:37:26 +00:00
7856f8d7f4 fixed i think 2025-09-13 13:52:39 +00:00
973c3b531a oops 2025-09-13 13:52:39 +00:00
d5a496e7f1 check 2025-09-13 13:52:39 +00:00
051e544ef6 [BE] Make PyObjectSlot use a global PyInterpreter
ghstack-source-id: 826f3f0c155cd0677776949b0c1c82395a95a1e8
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158409

[BE] Remove pyinterpreter struct from pyobjslot

ghstack-source-id: 826f3f0c155cd0677776949b0c1c82395a95a1e8
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159511
2025-09-13 13:52:39 +00:00
713 changed files with 8749 additions and 25913 deletions

View File

@ -31,7 +31,8 @@ pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
# USE_PRIORITIZED_TEXT_FOR_LD enables linker script optimization; see https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
@ -45,5 +46,6 @@ else
export USE_NVIDIA_PYPI_LIBS=1
fi
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
# USE_PRIORITIZED_TEXT_FOR_LD enables linker script optimization; see https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi

View File

@ -317,7 +317,7 @@ if __name__ == "__main__":
).decode()
print("Building PyTorch wheel")
build_vars = ""
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
# MAX_JOBS=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "
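The -Wl,-z,max-page-size=0x10000 linker flag keeps AArch64 wheels loadable on kernels configured with 64 KiB pages. As a hedged sketch of how a build_vars string like the one above is typically consumed, the accumulated KEY=VALUE pairs become environment variables of the child build process; the helper name below is illustrative, not the script's real function.

import os
import shlex
import subprocess

def run_with_build_vars(build_vars: str, cmd: str) -> None:
    # Split "KEY=VALUE KEY2=VALUE2 " into a dict and merge it into the child environment.
    extra_env = dict(pair.split("=", 1) for pair in shlex.split(build_vars))
    subprocess.run(shlex.split(cmd), env={**os.environ, **extra_env}, check=True)

# Hypothetical usage mirroring the hunk above:
# run_with_build_vars("CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 MAX_JOBS=5 ",
#                     "python setup.py bdist_wheel")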

View File

@ -262,10 +262,13 @@ case "$tag" in
TRITON_CPU=yes
;;
pytorch-linux-jammy-linter)
PYTHON_VERSION=3.10
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update the mypy version eventually, but that's for another day. The task
# would be to upgrade mypy to 1.0.0 with Python 3.11
PYTHON_VERSION=3.9
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter)
PYTHON_VERSION=3.10
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11)

View File

@ -1 +1 @@
e0dda9059d082537cee36be6c5e4fe3b18c880c0
56392aa978594cc155fa8af48cd949f5b5f1823a

View File

@ -1,2 +1,2 @@
transformers==4.56.0
transformers==4.54.0
soxr==0.5.0

View File

@ -1 +1 @@
bbb06c0334a6772b92d24bde54956e675c8c6604
5ae38bdb0dc066c5823e34dc9797afb9de42c866

View File

@ -42,27 +42,22 @@ install_pip_dependencies() {
# A workaround: ExecuTorch has moved to numpy 2.0, which is not compatible with the current
# numba and scipy versions used in PyTorch CI
conda_run pip uninstall -y numba scipy
# Yaspin is needed for running CI test (get_benchmark_analysis_data.py)
pip_install yaspin==3.1.0
popd
}
setup_executorch() {
pushd executorch
export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON"
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
popd
}
if [ $# -eq 0 ]; then
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
pushd executorch
setup_executorch
popd
else
"$@"
fi
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
setup_executorch
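The script this hunk touches uses the common shell dispatch idiom: with no arguments run every step in order, otherwise execute the named function via "$@" so CI can call a single step such as setup_executorch. A hedged Python rendering of that idiom, with placeholder step bodies:

import sys

def clone_executorch(): print("clone executorch")
def install_buck2(): print("install buck2")
def install_conda_dependencies(): print("install conda dependencies")
def install_pip_dependencies(): print("install pip dependencies")
def setup_executorch(): print("setup executorch")

PIPELINE = [clone_executorch, install_buck2, install_conda_dependencies,
            install_pip_dependencies, setup_executorch]

if __name__ == "__main__":
    if len(sys.argv) == 1:
        for step in PIPELINE:        # no arguments: run the whole pipeline
            step()
    else:
        globals()[sys.argv[1]]()     # mirrors the shell's `"$@"` single-step dispatch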

View File

@ -93,9 +93,8 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
#Pinned versions:
#test that import:
mypy==1.16.0 ; platform_system != "Windows"
mypy==1.16.0
# Pin MyPy version because new errors are likely to appear with each release
# Skip on Windows as lots of type annotations are POSIX specific
#Description: linter
#Pinned versions: 1.16.0
#test that import: test_typing.py, test_type_hints.py
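A hedged example of how the `; platform_system != "Windows"` environment marker on the mypy pin is evaluated at install time, using the packaging library that pip vendors:

from packaging.markers import Marker

marker = Marker('platform_system != "Windows"')
print(marker.evaluate())                                  # True on Linux/macOS, so mypy==1.16.0 is installed
print(marker.evaluate({"platform_system": "Windows"}))    # False: the pinned requirement is skipped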

View File

@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably

View File

@ -7,4 +7,4 @@ set -ex
SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh

View File

@ -41,6 +41,7 @@ def sample_vllm_test_library():
"pytest -v -s basic_correctness/test_cumem.py",
"pytest -v -s basic_correctness/test_basic_correctness.py",
"pytest -v -s basic_correctness/test_cpu_offload.py",
"VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py",
],
},
"vllm_basic_models_test": {
@ -67,12 +68,15 @@ def sample_vllm_test_library():
"-v",
"-s",
"entrypoints/llm",
"--ignore=entrypoints/llm/test_lazy_outlines.py",
"--ignore=entrypoints/llm/test_generate.py",
"--ignore=entrypoints/llm/test_generate_multiple_loras.py",
"--ignore=entrypoints/llm/test_collective_rpc.py",
]
),
"pytest -v -s entrypoints/llm/test_generate.py",
"pytest -v -s entrypoints/offline_mode",
"pytest -v -s entrypoints/llm/test_lazy_outlines.py",
"pytest -v -s entrypoints/llm/test_generate.py ",
"VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
],
},
"vllm_regression_test": {

View File

@ -0,0 +1,40 @@
#!/bin/bash
# This is where the local pytorch install in the docker image is located
pt_checkout="/var/lib/jenkins/workspace"
source "$pt_checkout/.ci/pytorch/common_utils.sh"
echo "functorch_doc_push_script.sh: Invoked with $*"
set -ex -o pipefail
version=${DOCS_VERSION:-nightly}
echo "version: $version"
# Build functorch docs
pushd $pt_checkout/functorch/docs
make html
popd
git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages
pushd functorch_ghpages
if [ "$version" == "main" ]; then
version=nightly
fi
git rm -rf "$version" || true
mv "$pt_checkout/functorch/docs/build/html" "$version"
git add "$version" || true
git status
git config user.email "soumith+bot@pytorch.org"
git config user.name "pytorchbot"
# If there are no changes, don't make a commit; the push is a no-op
git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true
git status
if [[ "${WITH_PUSH:-}" == true ]]; then
git push -u origin gh-pages
fi
popd

View File

@ -1,25 +0,0 @@
From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001
From: Michael Wang <13521008+isVoid@users.noreply.github.com>
Date: Tue, 1 Apr 2025 17:28:05 -0700
Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage
(#185)
Co-authored-by: isVoid <isVoid@users.noreply.github.com>
---
numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
index 1641bf77..233e9ed7 100644
--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -365,6 +365,9 @@ def _find_api(self, fname):
else:
variants = ('_v2', '')
+ if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
+ return getattr(self.lib, fname)
+
for variant in variants:
try:
return getattr(self.lib, f'{fname}{variant}')
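The patch above short-circuits numba-cuda's driver-API lookup for two calls so they keep binding to the unversioned symbols instead of a newer `_v2` entry point. A hedged, standalone sketch of that lookup pattern (not numba's actual implementation):

PINNED = ("cuCtxGetDevice", "cuCtxSynchronize")  # names taken from the patch above

def find_api(lib, fname):
    # Prefer the versioned `_v2` symbol, then fall back to the plain name --
    # except for pinned calls, which must stay on the unversioned entry point.
    if fname in PINNED:
        return getattr(lib, fname)
    for variant in ("_v2", ""):
        try:
            return getattr(lib, f"{fname}{variant}")
        except AttributeError:
            continue
    raise AttributeError(f"driver API {fname} not found")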

View File

@ -32,16 +32,6 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v
git config --global --add safe.directory /var/lib/jenkins/workspace
fi
# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
if [ -n "$NUMBA_CUDA_DIR" ]; then
NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
pushd "$NUMBA_CUDA_DIR"
patch -p4 <"$NUMBA_PATCH"
popd
fi
echo "Environment variables:"
env
@ -1550,10 +1540,14 @@ test_executorch() {
install_torchvision
install_torchaudio
INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh"
pushd /executorch
"${INSTALL_SCRIPT}" setup_executorch
export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
# NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
# from the PR
bash .ci/scripts/setup-linux.sh --build-tool cmake
echo "Run ExecuTorch unit tests"
pytest -v -n auto
@ -1567,6 +1561,10 @@ test_executorch() {
popd
# Test torchgen generated code for Executorch.
echo "Testing ExecuTorch op registration"
"$BUILD_BIN_DIR"/test_edge_op_registration
assert_git_not_dirty
}
@ -1574,7 +1572,6 @@ test_linux_aarch64() {
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
# Dynamo tests

View File

@ -137,7 +137,7 @@ sccache --show-stats
python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
(
if "%BUILD_ENVIRONMENT%"=="" (
echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash.
echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash.
) else (
copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%"

View File

@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" (
) else (
set CONDA_PARENT_DIR=C:\Jenkins
)
set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3
:: Be conservative here when rolling out the new AMI with conda. This will try
:: to install conda as before if it couldn't find the conda installation. This
:: can be removed eventually after we gain enough confidence in the AMI
if not exist %CONDA_ROOT_DIR% (
if not exist %CONDA_PARENT_DIR%\Miniconda3 (
set INSTALL_FRESH_CONDA=1
)
@ -17,14 +17,10 @@ if "%INSTALL_FRESH_CONDA%"=="1" (
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR%
%TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
)
:: Activate conda so that we can use its commands, i.e. conda, python, pip
call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%
:: Activate conda so that we can use its commands, i.e. conda, python, pip
call conda activate py_tmp
call pip install -r .ci/docker/requirements-ci.txt
call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3

View File

@ -14,7 +14,7 @@ if not errorlevel 0 exit /b
:: build\torch. Rather than changing all these references, making a copy of torch folder
:: from conda to the current workspace is easier. The workspace will be cleaned up after
:: the job anyway
xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
pushd .
if "%VC_VERSION%" == "" (

View File

@ -38,14 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
fi
# TODO: Move both of them to Windows AMI
python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments
# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node'
# scipy from 1.6.3 to 1.10
# expecttest from 0.1.3 to 0.3.0
# xdoctest from 1.0.2 to 1.3.0
python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42"
python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
# Install Z3 optional dependency for Windows builds.
python -m pip install z3-solver==4.15.1.0
@ -59,6 +52,9 @@ python -m pip install parameterized==0.8.1
# Install pulp for testing ilps under torch\distributed\_tools
python -m pip install pulp==2.9.0
# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
python -m pip install expecttest==0.3.0
run_tests() {
# Run nvidia-smi if available
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do

View File

@ -21,7 +21,6 @@ self-hosted-runner:
- linux.arm64.2xlarge.ephemeral
- linux.arm64.m7g.4xlarge
- linux.arm64.m7g.4xlarge.ephemeral
- linux.arm64.r7g.12xlarge.memory
- linux.4xlarge.nvidia.gpu
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu

View File

@ -264,7 +264,7 @@ def unzip_artifact_and_replace_files() -> None:
change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py")
for file in Path(f"artifacts/dist/{old_stem}").glob(
"*.dist-info/*",
"*.dist-info/**",
):
change_content_to_new_version(file)
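The one-character change above swaps a shallow glob for a recursive one. A hedged illustration of the difference with pathlib (the path is hypothetical, and the recursive form is written as "**/*" here because the entries matched by a bare trailing "**" vary across Python versions):

from pathlib import Path

dist = Path("artifacts/dist/torch-2.9.0.dev20250916")                   # hypothetical unpacked wheel directory
shallow = [p for p in dist.glob("*.dist-info/*") if p.is_file()]        # only direct children, e.g. METADATA, RECORD
recursive = [p for p in dist.glob("*.dist-info/**/*") if p.is_file()]   # also files nested in subdirectories
print(len(shallow), len(recursive))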

View File

@ -6,12 +6,6 @@ inputs:
cuda-version:
description: which cuda version to install, 'cpu' for none
required: true
python-version:
required: false
type: string
default: "3.10"
description: |
The python version to be used. Will be 3.10 by default
runs:
using: composite
@ -44,24 +38,18 @@ runs:
CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat"
{
echo "CONDA=${CONDA}";
echo "CONDA_RUN=${CONDA} run --no-capture-output";
echo "CONDA_BUILD=${CONDA} run conda-build";
echo "CONDA_INSTALL=${CONDA} install";
} >> "${GITHUB_ENV}"
- name: Setup Python3
env:
PYTHON_VERSION: ${{ inputs.python-version }}
shell: bash
run: |
set +e
set -x
# Create new py_tmp env with python-version
${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp
PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
PYTHON3=$(${CONDA_RUN} which python3)
EXIT_CODE=$?
if [[ "${EXIT_CODE}" == "0" ]]; then
@ -74,7 +62,7 @@ runs:
# installation, which is Python 3 based. Its Python is default to Python 3. Further, there
# is also the Miniconda installation that is Python 2 based, and both can be installed if
# needed. In both cases, Python binary is just called python
PYTHON=$(${CONDA_RUN} -n py_tmp which python)
PYTHON=$(${CONDA_RUN} which python)
EXIT_CODE=$?
if [[ "${EXIT_CODE}" == "0" ]]; then

View File

@ -1 +1 @@
87ff22e49ed0e92576c4935ccb8c143daac4a3cd
caba63f0fa29ef9e3d566699f32f11c07c8bda4e

View File

@ -1 +1 @@
090197034faf3b193c4467cedeb9281e3078892d
f510715882304796a96e33028b4f6de1b026c2c7

View File

@ -1 +1 @@
c77852e117bdf056c8e9a087e51d6f65cf6ba53d
6c5478ff7c3d50dd1e3047d72ec5909bea474073

View File

@ -82,10 +82,16 @@ RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim; \
else \
dnf install -y git curl wget sudo; \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if it is not already present
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version >/dev/null 2>&1; then \
@ -214,16 +220,11 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..."; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
SCCACHE_ARCHIVE="sccache-v0.8.1-aarch64-unknown-linux-musl"; \
else \
SCCACHE_ARCHIVE="sccache-v0.8.1-x86_64-unknown-linux-musl"; \
fi; \
curl -L -o sccache.tar.gz "https://github.com/mozilla/sccache/releases/download/v0.8.1/${SCCACHE_ARCHIVE}.tar.gz" \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv "${SCCACHE_ARCHIVE}"/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz "${SCCACHE_ARCHIVE}" \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
@ -284,7 +285,7 @@ RUN if command -v apt-get >/dev/null; then \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
else \
dnf install -y git curl wget sudo; \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version
@ -297,6 +298,12 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if it is not already present
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \

.github/labeler.yml vendored
View File

@ -130,6 +130,3 @@
- torch/csrc/inductor/aoti_include/**
- torchgen/aoti/**
- torchgen/gen_aoti_c_shim.py
"ciflow/vllm":
- .github/ci_commit_pins/vllm.txt

View File

@ -135,7 +135,7 @@ ROCM_SMOKE_WORKFLOWS = [
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["6.4"],
python_versions=["3.10"],
python_versions=["3.9"],
),
ciflow_config=CIFlowConfig(
labels={

View File

@ -84,9 +84,6 @@ repackage_wheel() {
rm -rf $package
}
# Required to re-package the wheel
${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1
pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
repackage_wheel $package

View File

@ -187,6 +187,8 @@ jobs:
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }}
if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}
- name: configure aws credentials

View File

@ -75,6 +75,10 @@ jobs:
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 30m to finish python docs unless there are issues
timeout-minutes: 30
- docs_type: functorch
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 15m to finish functorch docs unless there are issues
timeout-minutes: 15
# Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
# The current name requires updating the database last docs push query from test-infra every time the matrix is updated
name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}
@ -207,6 +211,16 @@ jobs:
path: cppdocs/
s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs
- name: Upload functorch Docs Preview
uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }}
with:
retention-days: 14
s3-bucket: doc-previews
if-no-files-found: error
path: functorch_ghpages/nightly/
s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()

View File

@ -2,12 +2,6 @@ name: Get Changed Files
on:
workflow_call:
inputs:
all_files:
description: "Whether to return all files instead of just changed files"
required: false
type: boolean
default: false
outputs:
changed-files:
description: "List of changed files (space-separated) or '*' if not in a PR"
@ -32,23 +26,17 @@ jobs:
# Get the PR number from the github context
PR_NUMBER="${{ github.event.number }}"
# Check if all_files is requested
if [ "${{ inputs.all_files }}" = "true" ]; then
echo "all_files input is true, returning all files"
echo "changed-files=*" >> "$GITHUB_OUTPUT"
else
# Use gh CLI to get changed files in the PR with explicit repo
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
# Use gh CLI to get changed files in the PR with explicit repo
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
CHANGED_FILES="*"
fi
echo "Changed files: $CHANGED_FILES"
echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
CHANGED_FILES="*"
fi
echo "Changed files: $CHANGED_FILES"
echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
else
echo "Not in PR context, setting changed files to '*'"
echo "changed-files=*" >> "$GITHUB_OUTPUT"

View File

@ -169,7 +169,7 @@ jobs:
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }}
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
- name: Setup GPU_FLAG for docker run

View File

@ -62,11 +62,6 @@ on:
required: false
type: number
default: 1
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -81,9 +76,10 @@ jobs:
strategy:
matrix: ${{ fromJSON(inputs.test-matrix) }}
fail-fast: false
runs-on: ${{ matrix.runner }}
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
runs-on: ${{ matrix.runner }}
steps:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
@ -135,9 +131,6 @@ jobs:
- name: Start monitoring script
id: monitor-script
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
env:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
@ -145,6 +138,9 @@ jobs:
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
run: |
python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
@ -182,12 +178,6 @@ jobs:
run: |
echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
- name: Preserve github env variables for use in docker
shell: bash
run: |
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test
id: test
env:
@ -203,22 +193,20 @@ jobs:
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
run: |
set -x
@ -248,7 +236,6 @@ jobs:
-e GITHUB_RUN_ATTEMPT \
-e JOB_ID \
-e JOB_NAME \
-e BASE_SHA \
-e BRANCH \
-e SHA1 \
-e AWS_DEFAULT_REGION \
@ -266,12 +253,10 @@ jobs:
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e TESTS_TO_INCLUDE \
-e HUGGING_FACE_HUB_TOKEN \
-e DASHBOARD_TAG \
--env-file="${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \
--ulimit core=0 \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="8g" \

View File

@ -151,7 +151,7 @@ jobs:
BUILD_WHEEL: 1
MAX_JOBS: 8
CUDA_VERSION: ${{ inputs.cuda-version }}
PYTHON_VERSION: "3.10"
PYTHON_VERSION: "3.9"
SCCACHE_BUCKET: "ossci-compiler-cache"
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
SCCACHE_REGION: us-east-1

View File

@ -184,7 +184,7 @@ jobs:
env:
USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
INSTALL_WINDOWS_SDK: 1
PYTHON_VERSION: "3.10"
PYTHON_VERSION: 3.9
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}

View File

@ -12,9 +12,6 @@ on:
paths:
- .github/workflows/build-vllm-wheel.yml
- .github/ci_commit_pins/vllm.txt
schedule:
# every morning at 01:30PM UTC, 9:30AM EST, 6:30AM PST
- cron: 30 13 * * *
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -27,33 +24,21 @@ jobs:
fail-fast: false
matrix:
python-version: [ '3.12' ]
# TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
# TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554
device: [ 'cu128', 'cu129' ]
runner: [ 'linux.12xlarge.memory' ]
include:
- platform: manylinux_2_28_x86_64
device: cu128
- device: cu128
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_x86_64
device: cu129
- device: cu129
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu128
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
runner: linux.arm64.r7g.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu129
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9'
runner: linux.arm64.r7g.12xlarge.memory
name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
name: "Build ${{ matrix.device }} vLLM wheel"
runs-on: ${{ matrix.runner }}
timeout-minutes: 480
env:
PY_VERS: ${{ matrix.python-version }}
MANYLINUX_IMAGE: ${{ matrix.manylinux-image }}
PLATFORM: ${{ matrix.platform }}
PLATFORM: 'manylinux_2_28_x86_64'
BUILD_DEVICE: ${{ matrix.device }}
steps:
- name: Setup SSH (Click me for login details)
@ -151,7 +136,7 @@ jobs:
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: vllm-wheel-${{ matrix.device }}-${{ matrix.platform }}-${{ matrix.python-version }}
name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }}
if-no-files-found: error
path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl
@ -161,29 +146,27 @@ jobs:
# Copied from build-triton-wheel workflow (mostly)
upload-wheel:
name: "Upload ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
name: "Upload ${{ matrix.device }} vLLM wheel"
needs:
- build-wheel
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129' ]
env:
PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }}
permissions:
id-token: write
contents: read
container:
image: continuumio/miniconda3:4.12.0
environment: ${{ ((github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'nightly-wheel-upload' || '' }}
environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Configure AWS credentials(PyTorch account) for main
if: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels
@ -207,15 +190,15 @@ jobs:
run: |
set -eux
mkdir -p "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-"${PLATFORM}"-*/* "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/"
- name: Set DRY_RUN
if: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
shell: bash
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
shell: bash
run: |

View File

@ -70,8 +70,9 @@ jobs:
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter,
pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
# Executorch pin needs update
# pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu,
pytorch-linux-noble-riscv64-py3.12-gcc14
]

View File

@ -44,7 +44,7 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_10-rocm6_4-build:
manywheel-py3_9-rocm6_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
@ -58,16 +58,16 @@ jobs:
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-rocm6_4
build_name: manywheel-py3_9-rocm6_4
build_environment: linux-binary-manywheel-rocm
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-rocm6_4-test: # Testing
manywheel-py3_9-rocm6_4-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_10-rocm6_4-build
- manywheel-py3_9-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
@ -82,14 +82,14 @@ jobs:
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
DESIRED_PYTHON: "3.9"
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: manywheel-py3_10-rocm6_4
name: manywheel-py3_9-rocm6_4
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4

View File

@ -43,11 +43,6 @@ on:
required: false
type: boolean
default: false
freezing:
description: Run freezing?
required: false
type: boolean
default: true
benchmark_configs:
description: The list of configs used the benchmark
required: false
@ -107,7 +102,7 @@ jobs:
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
timeout-minutes: 720
@ -121,9 +116,10 @@ jobs:
name: inductor-test
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-${{ inputs.training || 'false' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'true' }}-aotinductor-${{ inputs.aotinductor || 'true' }}-freezing-${{ inputs.freezing || 'true' }}
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
timeout-minutes: 720
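The dashboard-tag strings above are assembled from workflow inputs, with each `inputs.x || 'default'` expression falling back to a default when the input is unset. A hedged Python equivalent of that composition (fields taken from the fuller of the two variants above):

DEFAULTS = {"training": "false", "inference": "true", "default": "true", "dynamic": "true",
            "cppwrapper": "true", "aotinductor": "true", "freezing": "true"}

def dashboard_tag(**inputs):
    # Missing or empty inputs fall back to their defaults, mirroring `inputs.x || 'default'`.
    return "-".join(f"{key}-{inputs.get(key) or default}" for key, default in DEFAULTS.items())

print(dashboard_tag(training="true"))
# training-true-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true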

View File

@ -39,7 +39,7 @@ jobs:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0;8.6'
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -62,7 +62,7 @@ jobs:
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },

View File

@ -31,8 +31,6 @@ jobs:
if: github.repository_owner == 'pytorch'
name: Get changed files
uses: ./.github/workflows/_get-changed-files.yml
with:
all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }}
lintrunner-clang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -55,7 +53,7 @@ jobs:
with:
timeout: 120
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter
docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter
# NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
# to run git rev-parse HEAD~:.ci/docker when a new image is needed
fetch-depth: 0
@ -266,10 +264,10 @@ jobs:
with:
submodules: false
fetch-depth: 1
- name: Setup Python 3.10
- name: Setup Python 3.9
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.10'
python-version: '3.9'
architecture: x64
cache: pip
- name: Install dependencies

View File

@ -127,6 +127,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
@ -316,6 +318,32 @@ jobs:
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
if: false # Docker build needs pin update
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
if: false # Has been broken for a while
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml

View File

@ -140,6 +140,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

View File

@ -259,27 +259,3 @@ jobs:
docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit

View File

@ -53,3 +53,27 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit

View File

@ -36,8 +36,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# When building vLLM, uv doesn't like that we rename the wheel without changing the wheel metadata
allow-reuse-old-whl: false
build-additional-packages: "vision audio"
build-external-packages: "vllm"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11
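The allow-reuse-old-whl comment refers to the fact that a wheel's version is encoded both in its filename and in the Version field of its *.dist-info/METADATA, and strict installers such as uv reject wheels where the two disagree. A hedged check for that mismatch (the path is hypothetical):

import re
import zipfile

def wheel_filename_matches_metadata(path: str) -> bool:
    fname_ver = re.match(r"[^-]+-([^-]+)-", path.rsplit("/", 1)[-1]).group(1)
    with zipfile.ZipFile(path) as zf:
        metadata_name = next(n for n in zf.namelist() if n.endswith(".dist-info/METADATA"))
        metadata = zf.read(metadata_name).decode()
    meta_ver = re.search(r"^Version: (.+)$", metadata, re.MULTILINE).group(1)
    return fname_ver == meta_ver

# print(wheel_filename_matches_metadata("dist/torch-2.9.0.dev20250916-cp312-cp312-linux_x86_64.whl"))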

.gitignore vendored
View File

@ -259,9 +259,6 @@ gen
.pytest_cache
aten/build/*
# Linker scripts for prioritized text optimization
cmake/linker_script.ld
# Bram
plsdontbreak
@ -392,5 +389,3 @@ android/pytorch_android_torchvision/.cxx
# Claude Code local configuration
CLAUDE.local.md
/test_*.py
/debug_*.py

View File

@ -123,7 +123,6 @@ is_formatter = true
code = 'MYPY'
include_patterns = [
'setup.py',
'functorch/dim/**/*.py',
'torch/**/*.py',
'torch/**/*.pyi',
'caffe2/**/*.py',
@ -196,7 +195,6 @@ exclude_patterns = [
'tools/test/gen_operators_yaml_test.py',
'tools/test/gen_oplist_test.py',
'tools/test/test_selective_build.py',
'tools/experimental/dynamic_shapes/torchfuzz/**',
]
command = [
'python3',
@ -966,6 +964,7 @@ exclude_patterns = [
'test/jit/**', # should be run through test/test_jit.py
'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py
'test/fx/**', # should be run through test/test_fx.py
'test/bottleneck_test/**', # excluded by test/run_test.py
'test/package/**', # excluded by test/run_test.py
'test/distributed/argparse_util_test.py',
'test/distributed/bin/test_script.py',
@ -1411,6 +1410,8 @@ exclude_patterns = [
'torch/utils/benchmark/utils/timer.py',
'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
'torch/utils/bottleneck/__init__.py',
'torch/utils/bottleneck/__main__.py',
'torch/utils/bundled_inputs.py',
'torch/utils/checkpoint.py',
'torch/utils/collect_env.py',

View File

@ -1,4 +1,5 @@
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW)
# Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this
# sometimes makes XCode C compiler gets detected as "Clang", even when the C++
@ -379,13 +380,6 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder"
OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
"CPU_AARCH64" OFF)
# Prioritized text linker: ON by default for AArch64+Linux; the option is visible on all AArch64, x86, and ppc64le builds.
set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
if(LINUX AND CPU_AARCH64)
set(USE_PRIORITIZED_TEXT_DEFAULT ON)
endif()
cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
"${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)
option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance
@ -663,11 +657,6 @@ endif(MSVC)
string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
# Set linker max-page-size to 64KiB on AArch64 Linux
if(LINUX AND CPU_AARCH64)
add_link_options_if_supported("-z,max-page-size=0x10000")
endif()
# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
# applicable to mobile are disabled by this variable. Setting
# `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
@ -885,7 +874,7 @@ cmake_dependent_option(
"Whether to build the flash_attention kernel for scaled dot product attention.\
Will be disabled if not supported by the platform"
ON
"(USE_CUDA AND NOT MSVC) OR USE_ROCM"
"USE_CUDA OR USE_ROCM"
OFF)
cmake_dependent_option(
@ -902,7 +891,7 @@ IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
endif()
# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100.
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32)
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a")
set(USE_FBGEMM_GENAI ON)
endif()
@ -1432,57 +1421,3 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
DESTINATION "${CMAKE_INSTALL_BINDIR}")
endif()
if(USE_PRIORITIZED_TEXT_FOR_LD)
add_compile_options(
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
)
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
add_custom_command(
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
COMMENT "Generating prioritized text linker files"
VERBATIM
)
add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
if(BUILD_PYTHON)
set(LINKER_OPT_TARGETS torch_python)
endif()
if(NOT BUILD_LIBTORCHLESS)
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
if(USE_CUDA)
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
endif()
if(USE_XPU)
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
endif()
if(USE_ROCM)
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
endif()
endif()
foreach(tgt IN LISTS LINKER_OPT_TARGETS)
if(TARGET ${tgt})
add_dependencies("${tgt}" generate_linker_script)
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
else()
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
endif()
endforeach()
else()
if(LINUX AND CPU_AARCH64)
message(WARNING [[
It is strongly recommended to enable linker script optimization for all AArch64 Linux builds.
To do so, please export USE_PRIORITIZED_TEXT_FOR_LD=1
]])
endif()
endif()
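The removed CMake block wires a generated linker script into the link step of the main targets. As a rough illustration only (the real tools/setup_helpers/generate_linker_script.py rewrites the default ld script and uses its own input format), a minimal sketch of the prioritization idea, placing hot text sections ahead of the rest:

def write_prioritized_text_script(symbol_prefixes, fout):
    # Hot symbols listed first inside .text; everything else follows.
    lines = ["SECTIONS", "{", "  .text : {"]
    lines += [f"    *(.text.{prefix}*)" for prefix in symbol_prefixes]
    lines += ["    *(.text .text.*)", "  }", "}"]
    with open(fout, "w") as f:
        f.write("\n".join(lines) + "\n")

# Hypothetical symbol-prefix patterns; the real prioritized_text.txt uses its own entries.
write_prioritized_text_script(["hot_kernel_", "dispatch_"], "linker_script.ld")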

View File

@ -180,7 +180,7 @@ void Context::setUserEnabledNNPACK(bool e) {
}
bool Context::allowTF32CuDNN(const std::string& op) const {
if (op.empty()){
if (op.size() == 0){
bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
TORCH_CHECK(
@ -281,6 +281,9 @@ bool Context::userEnabledOverrideableSDP() const {
static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
#ifdef USE_ROCM
static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32";
#endif
bool Context::checkCuBLASConfigDeterministic() {
// If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
@ -340,6 +343,12 @@ void Context::setImmediateMiopen(bool b) {
}
bool Context::allowTF32CuBLAS() const {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
if (allow_tf32 != true) {
return false;
}
#endif
bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
TORCH_CHECK(
@ -353,6 +362,14 @@ bool Context::allowTF32CuBLAS() const {
}
void Context::setAllowTF32CuBLAS(bool b) {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
if (allow_tf32 != true) {
C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. "
<< "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it.";
return;
}
#endif
float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST;
setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee");
}
@ -426,7 +443,7 @@ void Context::setFloat32Precision(const std::string& backend, const std::string&
std::string msg;
auto iterp = _fp32_precisions.find(backend);
TORCH_CHECK(iterp != _fp32_precisions.end());
for (const auto& p : iterp->second) {
for (auto p : iterp->second) {
msg += p;
msg += " ";
}
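A hedged user-level view of what the ROCm TF32 changes above affect: on the ROCm code path the patch additionally gates TF32 matmul on the HIPBLASLT_ALLOW_TF32 environment variable, while on other builds the Python toggle takes effect directly.

import os
import torch

os.environ.setdefault("HIPBLASLT_ALLOW_TF32", "1")  # only consulted on the ROCm path in the hunk above
torch.backends.cuda.matmul.allow_tf32 = True
print(torch.backends.cuda.matmul.allow_tf32)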

View File

@ -65,24 +65,14 @@ DLDataType getDLDataType(const Tensor& t) {
break;
// TODO(#146647): use macro here instead of spelling out each shell dtype
case ScalarType::Float8_e5m2:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2;
break;
case ScalarType::Float8_e5m2fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2fnuz;
break;
case ScalarType::Float8_e4m3fn:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fn;
break;
case ScalarType::Float8_e4m3fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fnuz;
break;
case ScalarType::Float8_e8m0fnu:
dtype.code = DLDataTypeCode::kDLFloat8_e8m0fnu;
TORCH_CHECK_BUFFER(false, "float8 types are not supported by dlpack");
break;
case ScalarType::Float4_e2m1fn_x2:
dtype.code = DLDataTypeCode::kDLFloat4_e2m1fn;
dtype.lanes = 2;
dtype.bits = 4;
TORCH_CHECK_BUFFER(false, "float4 types are not supported by dlpack");
break;
case ScalarType::QInt8:
case ScalarType::QUInt8:
@ -187,11 +177,7 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat
ScalarType toScalarType(const DLDataType& dtype) {
ScalarType stype = ScalarType::Undefined;
if (dtype.code != DLDataTypeCode::kDLFloat4_e2m1fn) {
TORCH_CHECK_BUFFER(
dtype.lanes == 1,
"ATen does not support lanes != 1 for dtype code", std::to_string(dtype.code));
}
TORCH_CHECK_BUFFER(dtype.lanes == 1, "ATen does not support lanes != 1");
switch (dtype.code) {
case DLDataTypeCode::kDLUInt:
switch (dtype.bits) {
@ -283,73 +269,6 @@ ScalarType toScalarType(const DLDataType& dtype) {
false, "Unsupported kDLBool bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e5m2:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2 bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e5m2fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fn:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fn;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fn bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e8m0fnu:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e8m0fnu;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e8m0fnu bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat4_e2m1fn:
switch (dtype.bits) {
case 4:
switch (dtype.lanes) {
case 2:
stype = ScalarType::Float4_e2m1fn_x2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn lanes ", std::to_string(dtype.lanes));
}
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn bits ", std::to_string(dtype.bits));
}
break;
default:
TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code));
}
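The dtype hunks above decide which scalar types are representable in the DLPack exchange format. A hedged probe of the user-visible behaviour; whether float8 export succeeds depends on the PyTorch build, so the snippet checks rather than assumes:

import torch
from torch.utils.dlpack import from_dlpack, to_dlpack

x = torch.arange(4, dtype=torch.float32)
assert torch.equal(x, from_dlpack(to_dlpack(x)))      # ordinary float tensors always round-trip

if hasattr(torch, "float8_e5m2"):
    try:
        from_dlpack(to_dlpack(torch.zeros(4, dtype=torch.float8_e5m2)))
        print("float8_e5m2 is exported via DLPack in this build")
    except (RuntimeError, BufferError, TypeError) as err:
        print("float8_e5m2 DLPack export rejected:", err)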
@ -401,13 +320,30 @@ T* toDLPackImpl(const Tensor& src) {
// The following code detects whether the src follows
// a contiguous pattern. If the src follows such a pattern (the common case),
// then we do not need to normalize the strides.
bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1;
bool need_normalize_strides = false;
int64_t expected_stride = 1;
for (int i = src.dim() - 1; i >= 0; i--) {
// detect if we do not meet the contiguous pattern
// and the size is 1, so there is an opportunity to normalize
if (src.stride(i) != expected_stride && src.size(i) == 1) {
need_normalize_strides = true;
break;
}
expected_stride *= src.size(i);
}
// less common case, try normalizing the strides
if (need_normalize_strides) {
// create a new tensor with possibly normalized strides
// gh-83069
auto shape = src.sizes();
view = src.as_strided(shape, {1}, src.storage_offset());
auto strides = src.strides().vec();
for (int i = 0; i < src.dim(); i++) {
if (shape[i] < 2) {
strides[i] = 1;
}
}
view = src.as_strided(shape, strides, src.storage_offset());
}
ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);
@ -418,8 +354,8 @@ T* toDLPackImpl(const Tensor& src) {
atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device());
atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim());
atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src);
atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(view.sizes().data());
atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(view.strides().data());
atDLMTensor->tensor.dl_tensor.shape = view.sizes().data();
atDLMTensor->tensor.dl_tensor.strides = view.strides().data();
atDLMTensor->tensor.dl_tensor.byte_offset = 0;
fillVersion(&atDLMTensor->tensor);
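The stride-normalization logic above relies on the fact that for a dimension of size 1 the stride value never affects addressing, so it can be rewritten without copying data. A hedged demonstration with the public torch API:

import torch

t = torch.arange(6.0).reshape(2, 3).t()[:, :1]        # shape (3, 1), strides (1, 3): size-1 dim with stride != 1
new_strides = [s if d > 1 else 1 for s, d in zip(t.stride(), t.size())]
normalized = t.as_strided(t.size(), new_strides, t.storage_offset())
assert torch.equal(t, normalized)                     # same values, same storage; only the metadata changed
print(t.stride(), normalized.stride())                # (1, 3) (1, 1)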

View File

@ -102,7 +102,7 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
// SparseTensorImpl has no storage, so we cannot query its nbytes.
// (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse)
// Same for XLA
if (base.unsafeGetTensorImpl()->has_storage() && data_ptr().device().type() != c10::DeviceType::XLA) {
if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) {
original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes();
} else {
original_storage_size_ = -1;

View File

@ -133,7 +133,7 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const
: c10::TensorImpl(
c10::DispatchKeySet(DispatchKey::Functionalize),
view_value.dtype(),
base->storage().data_ptr().device()
view_value.device()
),
value_(view_value),
is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
@ -485,10 +485,7 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptr<TensorI
c10::Device FunctionalTensorWrapper::device_custom() const {
// The storage pointer already uses the underlying tensor custom device (if
// applicable) to extract the device. So, we dont have to recurse again by
// doing value_.unsafeGetTensorImpl()->device().
return storage().data_ptr().device();
return value_.unsafeGetTensorImpl()->device();
}
at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const {
return value_.unsafeGetTensorImpl()->sizes();

View File

@ -1637,7 +1637,9 @@ bool gemm_and_bias(
if (activation == GEMMAndBiasActivationEpilogue::RELU) {
epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
} else if (activation == GEMMAndBiasActivationEpilogue::GELU) {
#if CUDA_VERSION >= 11040 || defined(USE_ROCM)
epilogue = CUBLASLT_EPILOGUE_GELU_BIAS;
#endif
}
if (bias != nullptr) {
@ -1929,6 +1931,7 @@ void scaled_gemm(
bool use_fast_accum) {
// Note: see `cublasCommonArgs` for various non-intuitive manupulations
// of input arguments to this function.
#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;
const float alpha_val = 1.0;
@ -1951,8 +1954,8 @@ void scaled_gemm(
#if ROCM_VERSION >= 70000
if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) {
// TODO: add constraints based on hipblaslt internals
TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0),
"M, N must be multiples of 16 and K should be multiple of 128 for MX format. "
TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0),
"Matrix dimensions must be multiples of 32 for MX format. "
"Got m=", m, ", n=", n, ", k=", k);
}
#endif
@ -2130,6 +2133,8 @@ void scaled_gemm(
" scaleType ",
scaleType);
return;
#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM)
TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above");
}
void int8_gemm(

View File

@ -266,14 +266,11 @@ CUDAGeneratorImpl::CUDAGeneratorImpl(
* See Note [Acquire lock when using random generators]
*/
void CUDAGeneratorImpl::set_current_seed(uint64_t seed) {
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
state_->seed_ = seed;
state_->philox_offset_per_thread_ = 0;
no_reset_rnn_state_.clear();
} else {
TORCH_CHECK(state_->seed_ == seed, "CUDAGeneratorImpl::set_current_seed can be called during stream capture only if new seed is the same as the original seed.");
// no-op case
}
at::cuda::assertNotCapturing(
"Cannot call CUDAGeneratorImpl::set_current_seed");
state_->seed_ = seed;
state_->philox_offset_per_thread_ = 0;
no_reset_rnn_state_.clear();
}
/**
@ -302,6 +299,9 @@ uint64_t CUDAGeneratorImpl::get_offset() const {
* Gets the current seed of CUDAGeneratorImpl.
*/
uint64_t CUDAGeneratorImpl::current_seed() const {
// Debatable if current_seed() should be allowed in captured regions.
// Conservatively disallow it for now.
at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed");
return state_->seed_;
}
@ -346,6 +346,8 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
* and size of the internal state.
*/
void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
at::cuda::assertNotCapturing(
"Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing.");
static const size_t seed_size = sizeof(uint64_t);
static const size_t offset_size = sizeof(int64_t);
static const size_t total_size = seed_size + offset_size;
@ -400,27 +402,15 @@ c10::intrusive_ptr<c10::GeneratorImpl> CUDAGeneratorImpl::graphsafe_get_state()
*/
void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) {
// see Note [Why enforce RNG offset % 4 == 0?]
// Note: If you use CUDNN RNN's, calling
// set_philox_offset_per_thread instead of set_offset will cause the
// cudnn RNN rng state to become stale.
TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4");
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
state_->philox_offset_per_thread_ = offset;
} else {
state_->offset_intragraph_ = offset;
}
state_->philox_offset_per_thread_ = offset;
}
/**
* Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl.
*/
uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const {
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
return state_->philox_offset_per_thread_;
} else {
return state_->offset_intragraph_;
}
return state_->philox_offset_per_thread_;
}
/**

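A minimal sketch of the capture-aware pattern this hunk toggles, with a plain bool standing in for at::cuda::currentStreamCaptureStatus(): outside CUDA graph capture the seed and Philox offset mutate the global state as usual; during capture, re-seeding is only tolerated as a no-op with the original seed, and the offset is tracked per graph.

    #include <cstdint>
    #include <stdexcept>

    struct GeneratorState {
      uint64_t seed = 0;
      uint64_t philox_offset_per_thread = 0;
      uint64_t offset_intragraph = 0;
    };

    void set_current_seed(GeneratorState& s, bool capturing, uint64_t seed) {
      if (!capturing) {
        s.seed = seed;
        s.philox_offset_per_thread = 0;  // a new seed resets the offset
      } else if (s.seed != seed) {
        throw std::runtime_error(
            "re-seeding during stream capture is only allowed with the original seed");
      }                                   // same seed during capture: no-op
    }

    void set_philox_offset(GeneratorState& s, bool capturing, uint64_t offset) {
      if (offset % 4 != 0) {
        throw std::runtime_error("offset must be a multiple of 4");
      }
      if (!capturing) {
        s.philox_offset_per_thread = offset;
      } else {
        s.offset_intragraph = offset;     // graph-local offset used at replay
      }
    }
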
View File

@ -19,7 +19,7 @@
#define DLPACK_MAJOR_VERSION 1
/*! \brief The current minor version of dlpack */
#define DLPACK_MINOR_VERSION 1
#define DLPACK_MINOR_VERSION 0
/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
@ -32,7 +32,9 @@
#define DLPACK_DLL
#endif
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stdint.h>
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stddef.h>
#ifdef __cplusplus
@ -157,26 +159,6 @@ typedef enum {
kDLComplex = 5U,
/*! \brief boolean */
kDLBool = 6U,
/*! \brief FP8 data types */
kDLFloat8_e3m4 = 7U,
kDLFloat8_e4m3 = 8U,
kDLFloat8_e4m3b11fnuz = 9U,
kDLFloat8_e4m3fn = 10U,
kDLFloat8_e4m3fnuz = 11U,
kDLFloat8_e5m2 = 12U,
kDLFloat8_e5m2fnuz = 13U,
kDLFloat8_e8m0fnu = 14U,
/*! \brief FP6 data types
* Setting bits != 6 is currently unspecified, and the producer must ensure it is set
* while the consumer must stop importing if the value is unexpected.
*/
kDLFloat6_e2m3fn = 15U,
kDLFloat6_e3m2fn = 16U,
/*! \brief FP4 data types
* Setting bits != 4 is currently unspecified, and the producer must ensure it is set
* while the consumer must stop importing if the value is unexpected.
*/
kDLFloat4_e2m1fn = 17U,
} DLDataTypeCode;
/*!
@ -190,12 +172,6 @@ typedef enum {
* - int8: type_code = 0, bits = 8, lanes = 1
* - std::complex<float>: type_code = 5, bits = 64, lanes = 1
* - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
* - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory)
* - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory)
* - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory)
*
* When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e.,
* for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element.
*/
typedef struct {
/*!
@ -253,12 +229,12 @@ typedef struct {
/*! \brief The data type of the pointer*/
DLDataType dtype;
/*! \brief The shape of the tensor */
int64_t* shape;
const int64_t* shape;
/*!
* \brief strides of the tensor (in number of elements, not bytes)
* can be NULL, indicating tensor is compact and row-majored.
*/
int64_t* strides;
const int64_t* strides;
/*! \brief The offset in bytes to the beginning pointer to data */
uint64_t byte_offset;
} DLTensor;
@ -293,7 +269,7 @@ typedef struct DLManagedTensor {
void (*deleter)(struct DLManagedTensor * self);
} DLManagedTensor;
// bit masks used in the DLManagedTensorVersioned
// bit masks used in in the DLManagedTensorVersioned
/*! \brief bit mask to indicate that the tensor is read only. */
#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
@ -306,14 +282,6 @@ typedef struct DLManagedTensor {
*/
#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)
/*
* \brief bit mask to indicate that whether a sub-byte type is packed or padded.
*
* The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can
* be set by the producer to signal that a tensor of sub-byte type is padded.
*/
#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL)
/*!
* \brief A versioned and managed C Tensor object, manage memory of DLTensor.
*

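For the packed sub-byte convention described in the removed comment block above ((D >> (i * bits)) masked with bit_mask, little bit-endian), here is an illustrative extractor for widths that divide a byte, such as fp4; it returns raw bit patterns rather than decoded floats and is not part of dlpack.h.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // bits must divide 8 for this byte-wise sketch (fp4: bits = 4, two values per byte)
    uint8_t extract_packed(const uint8_t* data, size_t i, unsigned bits) {
      const unsigned per_byte = 8 / bits;
      const uint8_t mask = static_cast<uint8_t>((1u << bits) - 1);
      const uint8_t byte = data[i / per_byte];
      return static_cast<uint8_t>((byte >> ((i % per_byte) * bits)) & mask);
    }

    int main() {
      const uint8_t packed[] = {0xA3};  // low nibble 0x3 is element 0, high nibble 0xA is element 1
      std::printf("%x %x\n", extract_packed(packed, 0, 4), extract_packed(packed, 1, 4));  // "3 a"
    }
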
View File

@ -171,8 +171,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
POINTWISE_BOXED(fill_.Scalar);
POINTWISE_BOXED(zero_);
// This is special because this op doesn't return anything
m.impl("_assert_tensor_metadata", native::_assert_tensor_metadata);
#undef UNARY_POINTWISE
#undef UNARY_POINTWISE_ALL

View File

@ -81,7 +81,7 @@ Tensor math_channel_shuffle(const Tensor& self, int64_t groups) {
// TODO: contiguous can be made to preserve the memory format
// of the input. However since the above reshape clobbers h and w
// it may not be safe to do that, since channels_last contiguous
// may think oc and the last dim correspond to h,w?
// may think oc and and the last dim correspond to h,w?
// It is not clear, however from initial looking around it feels that
// this may not be correct.
// In this case channels last will likely require custom implementation

View File

@ -1,4 +1,3 @@
#pragma once
#include <ATen/core/Tensor.h>
#include <ATen/Config.h>
#include <cstdint>

View File

@ -67,13 +67,13 @@ TORCH_PRECOMPUTE_META_FUNC(fractional_max_pool3d)(
int64_t inputH = input_.size(heightDim);
int64_t inputW = input_.size(widthDim);
TORCH_CHECK((poolSizeT <= inputT) && (outputT + poolSizeT - 1 < inputT),
TORCH_CHECK(outputT + poolSizeT - 1 < inputT,
"fractional_max_pool3d_out(): pool time ", poolSizeT,
" too large relative to input time ", inputT);
TORCH_CHECK((poolSizeW <= inputW) && (outputW + poolSizeW - 1 < inputW),
TORCH_CHECK(outputW + poolSizeW - 1 < inputW,
"fractional_max_pool3d_out(): pool width ", poolSizeW,
" too large relative to input width ", inputW);
TORCH_CHECK((poolSizeH <= inputH) && (outputH + poolSizeH - 1 < inputH),
TORCH_CHECK(outputH + poolSizeH - 1 < inputH,
"fractional_max_pool3d_out(): pool height ", poolSizeH,
" too large relative to input height ", inputH);

View File

@ -23,6 +23,8 @@ Tensor& max_unpooling2d_forward_out_cpu(
// Nondeterministic with duplicate indices
at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out");
auto oheight = output_size[0];
auto owidth = output_size[1];
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices_.scalar_type());
@ -43,9 +45,6 @@ Tensor& max_unpooling2d_forward_out_cpu(
self_.sizes(), " with dimension ", i , " being empty.");
}
auto oheight = output_size[0];
auto owidth = output_size[1];
auto memory_format = self_.suggest_memory_format();
auto self = self_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format);

View File

@ -73,7 +73,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value)
for (const auto i : c10::irange((size_t)l_pad)) {
auto pad_idx = pad.size() - ((i + 1) * 2);
auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, "
"which is invalid. Check dimension ", l_diff + i, " of your input.");
new_shape.emplace_back(new_dim);
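
A tiny worked example of the boundary this check moves: a dimension of size 3 padded with -1 and -2 gives new_dim == 0, which the >= 0 form accepts as a valid empty output while the > 0 form rejects it.

    #include <cstdio>

    int main() {
      const long input_size = 3, pad_left = -1, pad_right = -2;
      const long new_dim = input_size + pad_left + pad_right;  // 0: empty, but not negative
      std::printf("new_dim = %ld\n", new_dim);
    }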

View File

@ -2174,7 +2174,7 @@ static void _scatter_via_index_put(
if (self.dim() == 1 || broadcast_index) {
Tensor squeezed = index;
if (broadcast_index && index.dim() > 1) {
for (int64_t d = index.dim() - 1; d >= 0; --d) {
for (const auto d : c10::irange(index.dim())) {
if (d == dim) {
continue;
}

View File

@ -52,7 +52,6 @@ void apply_triu_tril_single(
int64_t self_col_stride,
bool upper) {
constexpr int64_t zero = 0;
k = std::clamp(k, -n, m); // Clamp k to [-n, m] to prevent i + k arithmetic overflow, especially if k approaches INT64_MAX/INT64_MIN.
if (upper) {
parallel_for(0, n, 0, [&](int64_t start, int64_t end) {

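A small sketch of why the clamp above is needed: with k close to INT64_MAX, forming i + k as a column boundary overflows signed arithmetic, while clamping k into [-n, m] first keeps the sum in range without changing which elements survive.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
      const int64_t n = 4, m = 5;   // rows, cols
      int64_t k = INT64_MAX;        // diagonal offset supplied by the caller
      k = std::clamp(k, -n, m);     // now 5; i + k can no longer overflow
      for (int64_t i = 0; i < n; ++i) {
        const int64_t first_kept_col = std::max<int64_t>(0, i + k);  // triu keeps columns >= i + k
        std::printf("row %lld keeps columns from %lld on\n",
                    (long long)i, (long long)first_kept_col);
      }
    }
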
View File

@ -85,11 +85,11 @@ void cpu_max_unpool(
if constexpr (is_3d) {
TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
" (output volumes are of size ", output_depth,
"x", output_height, "x", output_width, ")");
"x", output_height, "x", output_width);
} else {
TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
" (output volumes are of size ", output_height,
"x", output_width, ")");
"x", output_width);
}
}

View File

@ -1138,14 +1138,9 @@ bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) {
bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) {
// TODO: We might want to enforce some structure on the shapes of the scale
// tensors
bool is_fp8_path = (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4));
bool is_packed_fp4_path = false;
#ifdef USE_ROCM
is_packed_fp4_path = (t.scalar_type() == ScalarType::Float4_e2m1fn_x2 && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1) * 2, 32), 4));
#endif
return (is_fp8_path || is_packed_fp4_path) && scale.is_contiguous();
return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4)
&& scale.is_contiguous());
}
bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) {
@ -1386,15 +1381,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
int packed_factor = 1;
if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
// For float4 data type, each byte stores two 4-bit floating-point values,
// effectively packing two elements into one byte.
packed_factor = 2;
}
TORCH_CHECK(mat1.size(0) % 16 == 0 && (mat1.size(1) * packed_factor) % 128 == 0 &&
mat2.size(1) % 16 == 0,
"M, N must be multiples of 16 and K must be multiple of 128 for block-wise scaling");
TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
"Matrix dimensions must be multiples of 32 for block-wise scaling");
TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
out.scalar_type() == ScalarType::Half,

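A standalone sketch of the scale-shape arithmetic above, with local round_up/ceil_div helpers assumed to mirror the ones used in Blas.cpp: for 1x32 block-wise scaling the scale tensor is sized from a 128-row by 4-block padded grid, and packed fp4 doubles K before the 32-wide blocking.

    #include <cstdint>
    #include <cstdio>

    int64_t ceil_div(int64_t a, int64_t b) { return (a + b - 1) / b; }
    int64_t round_up(int64_t a, int64_t b) { return ceil_div(a, b) * b; }

    int64_t expected_scale_numel(int64_t rows, int64_t cols, bool packed_fp4) {
      const int64_t k = packed_fp4 ? cols * 2 : cols;  // packed fp4 stores two values per byte
      return round_up(rows, 128) * round_up(ceil_div(k, 32), 4);
    }

    int main() {
      std::printf("%lld\n", (long long)expected_scale_numel(256, 4096, false));  // fp8: 256 * 128
      std::printf("%lld\n", (long long)expected_scale_numel(256, 2048, true));   // packed fp4, same K once unpacked
    }
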
View File

@ -51,7 +51,7 @@ std::vector<Tensor> foreach_tensor_list_op(
Op<opmath_t>(),
alpha.to<opmath_t>());
return std::move(tensor_lists[2]);
return tensor_lists[2];
}
template <typename T, template <class> class Op>

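A minimal illustration of the return-statement change these foreach hunks keep flipping: NRVO only applies to a whole local object, so returning one element of a local vector-of-vectors copies it unless it is explicitly moved.

    #include <utility>
    #include <vector>

    std::vector<int> take_result_copy(std::vector<std::vector<int>> lists) {
      return lists[1];             // lists[1] is an lvalue subobject: this copies
    }

    std::vector<int> take_result_move(std::vector<std::vector<int>> lists) {
      return std::move(lists[1]);  // steals the element's buffer instead
    }
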
View File

@ -45,7 +45,7 @@ std::vector<Tensor> foreach_binary_op(
/* res_arg_index */ 1>(),
Op<opmath_t>(),
scalar.to<opmath_t>());
return std::move(tensor_lists[1]);
return tensor_lists[1];
}
template <typename T, template <class> class Op>

View File

@ -33,7 +33,7 @@ std::vector<Tensor> foreach_binary_op(
}
tensor_lists.emplace_back(tensors.vec());
tensor_lists.emplace_back(std::move(vec_res));
tensor_lists.emplace_back(vec_res);
using opmath_t = at::opmath_type<T>;
multi_tensor_apply<2, opmath_t>(
@ -46,7 +46,7 @@ std::vector<Tensor> foreach_binary_op(
/* res_arg_index */ 1>(),
Op<opmath_t>());
return std::move(tensor_lists[1]);
return tensor_lists[1];
}
template <typename T, template <class> class Op>

View File

@ -56,7 +56,7 @@ std::vector<Tensor> foreach_binary_op(
Op<opmath_t>(),
scalar.data_ptr<T>(),
alpha.to<opmath_t>());
return std::move(tensor_lists[1]);
return tensor_lists[1];
}
template <typename T, template <class> class Op>

View File

@ -57,7 +57,7 @@ std::vector<Tensor> foreach_pointwise_op(
scalar.to<opmath_t>());
});
return std::move(tensor_lists[3]);
return tensor_lists[3];
}
template <template <class> class Op>
@ -160,7 +160,7 @@ std::vector<Tensor> foreach_pointwise_op(
Op<opmath_t>());
});
return std::move(tensor_lists[3]);
return tensor_lists[3];
}
#define FOREACH_POINTWISE_OP_SCALAR(NAME, OP) \

View File

@ -37,7 +37,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
vec_res.emplace_back(at::native::empty_like(t));
}
std::vector<std::vector<at::Tensor>> tensor_lists{
tensors1.vec(), tensors2.vec(), tensors3.vec(), std::move(vec_res)};
tensors1.vec(), tensors2.vec(), tensors3.vec(), vec_res};
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
@ -56,7 +56,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
LerpFunctor<opmath_t>());
});
return std::move(tensor_lists[3]);
return tensor_lists[3];
}
void foreach_tensor_lerp_ternary_cuda_(
@ -104,7 +104,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
vec_res.emplace_back(at::native::empty_like(t));
}
std::vector<std::vector<at::Tensor>> tensor_lists{
tensors1.vec(), tensors2.vec(), std::move(vec_res)};
tensors1.vec(), tensors2.vec(), vec_res};
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
@ -124,7 +124,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
weight.to<opmath_t>());
});
return std::move(tensor_lists[2]);
return tensor_lists[2];
}
void foreach_tensor_lerp_list_cuda_(
@ -173,7 +173,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda(
vec_res.emplace_back(at::native::empty_like(t));
}
std::vector<std::vector<at::Tensor>> tensor_lists{
tensors1.vec(), tensors2.vec(), std::move(vec_res)};
tensors1.vec(), tensors2.vec(), vec_res};
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
@ -193,7 +193,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda(
LerpFunctor<opmath_t>());
});
return std::move(tensor_lists[2]);
return tensor_lists[2];
}
void foreach_tensor_lerp_scalarlist_cuda_(

View File

@ -67,7 +67,7 @@ std::vector<Tensor> foreach_unary_op(TensorList tensors) {
/* res_arg_index */ 1>(),
Op<opmath_t>());
return std::move(tensor_lists[1]);
return tensor_lists[1];
}
template <typename scalar_t, template <class> class Op>

View File

@ -125,6 +125,8 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices_.scalar_type());
auto oheight = output_size[0];
auto owidth = output_size[1];
TensorArg output_arg{output, "output", 1}, self_arg{self_, "self_", 2},
indices_arg{indices_, "indices_", 3};
@ -147,9 +149,6 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
output_size.size() == 2,
"There should be exactly two elements (height, width) in output_size, but got ", output_size.size(), " elements.");
auto oheight = output_size[0];
auto owidth = output_size[1];
int64_t dimw = 2;
int64_t dimh = 1;
int64_t numBatch = 1;
@ -218,6 +217,9 @@ static void max_unpooling3d_shape_check(
IntArrayRef stride,
IntArrayRef padding,
const char *fn_name) {
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
TORCH_CHECK(
indices.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices.scalar_type());
@ -248,10 +250,6 @@ static void max_unpooling3d_shape_check(
"strides should be greater than zero, but got stride: ",
stride);
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
int dimw = 3;
int dimh = 2;
int dimt = 1;
@ -404,6 +402,8 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,
const Tensor& indices_,
IntArrayRef output_size,
Tensor& grad_input) {
int64_t oheight = output_size[0];
int64_t owidth = output_size[1];
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
@ -426,9 +426,6 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,
TORCH_CHECK(output_size.size() == 2, "output_size must have two elements, got size: ", output_size.size());
int64_t oheight = output_size[0];
int64_t owidth = output_size[1];
int64_t nInputCols, nInputRows, nInputPlane;
int dimw = 2;
@ -508,14 +505,13 @@ at::Tensor& max_unpooling3d_backward_out_cuda(const Tensor& grad_output_,
IntArrayRef padding,
Tensor& grad_input) {
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
max_unpooling3d_shape_check(
self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()");
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
max_unpooling3d_shape_check(
self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()");
int batchSize = 0;
int inputSlices = 0;
int inputTime = 0;

View File

@ -300,6 +300,8 @@ void nonzero_static_cuda_out_impl(
int64_t size,
int64_t fill_value,
Tensor& out) {
#if defined(CUDA_VERSION) || defined(USE_ROCM)
Tensor self_contiguous_ = self.contiguous();
// see comment in nonzero_cuda_out_impl on reqs for out
bool out_correct_size =
@ -315,17 +317,6 @@ void nonzero_static_cuda_out_impl(
out_temp =
Tensor(at::detail::empty_cuda({self.dim(), size}, out.options())).t();
}
// If input has zero elements, avoid kernel grid calculations (which can
// produce zero divisors) and just fill the output with fill_value.
if (self.numel() == 0) {
if (need_to_copy) {
out_temp.fill_(fill_value);
out.copy_(out_temp);
} else {
out.fill_(fill_value);
}
return;
}
int64_t* out_data_ptr = need_to_copy ? out_temp.mutable_data_ptr<int64_t>()
: out.mutable_data_ptr<int64_t>();
@ -375,6 +366,9 @@ void nonzero_static_cuda_out_impl(
if (need_to_copy) {
out.copy_(out_temp);
}
#else
TORCH_CHECK(false, "Nonzero_static is not supported for cuda <= 11.4");
#endif
}
Tensor& nonzero_out_cuda(const Tensor& self, Tensor& out) {

View File

@ -416,7 +416,6 @@ struct ReduceOp {
if (config.should_block_y_reduce()) {
value = block_y_reduce<output_vec_size>(value, shared_memory);
}
__syncthreads();
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}

View File

@ -17,11 +17,12 @@ __global__ static void compute_cuda_kernel(
index_t* result_ptr,
int64_t size,
int64_t result_size) {
CUDA_KERNEL_ASSERT_PRINTF(
result_size == cumsum_ptr[size - 1],
if (C10_UNLIKELY((result_size != cumsum_ptr[size - 1]))) {
printf("%s:%d:%s: block: [%d,%d,%d], thread: [%d,%d,%d] "
"Invalid input! In `repeat_interleave`, the `output_size` argument (%ld) must be the same as the sum of the elements in the `repeats` tensor (%ld).\n",
result_size,
cumsum_ptr[size - 1]);
__FILE__, __LINE__, __func__,blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, result_size, cumsum_ptr[size - 1 ]);
CUDA_KERNEL_ASSERT(result_size == cumsum_ptr[size - 1])
}
int64_t idx = ((int64_t) blockIdx.x) * blockDim.x + threadIdx.x;
int64_t stride = (blockDim.x * gridDim.x) / C10_WARP_SIZE;

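A host-side restatement (a sketch, not the CUDA code) of the invariant both versions of the kernel assert: the caller-supplied output_size must equal the sum of the repeats tensor, i.e. the last entry of its inclusive cumsum.

    #include <cstdint>
    #include <numeric>
    #include <stdexcept>
    #include <vector>

    void check_repeat_interleave_output_size(const std::vector<int64_t>& repeats,
                                             int64_t output_size) {
      const int64_t total = std::accumulate(repeats.begin(), repeats.end(), int64_t{0});
      if (total != output_size) {
        throw std::invalid_argument(
            "`output_size` must equal the sum of the elements in `repeats`");
      }
    }
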
View File

@ -226,38 +226,6 @@ __global__ void CatArrayBatchedCopy_contig(
}
}
template <typename T, typename IndexType, int Dims, int batch_size, int stride_size, int alignment, int elems_per_vec>
__global__ void CatArrayBatchedCopy_vectorized(
char* output,
CatArrInputTensorMetadata<T, IndexType, batch_size, stride_size> inputs,
TensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
const int concatDim,
IndexType trailingSize) {
IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
IndexType nElements = inputs.nElements[blockIdx.y] / elems_per_vec;
if(tid >= nElements) return;
const char * data = (char*)inputs.input[blockIdx.y];
IndexType offset = inputs.offset[blockIdx.y] * trailingSize / elems_per_vec;
IndexType dimSize = inputs.dimSize[blockIdx.y] * trailingSize / elems_per_vec;
int64_t dataOffset = (int64_t)offset * alignment; // in bytes
IndexType stride = gridDim.x * blockDim.x;
while( tid < nElements){
int64_t elementOffset = (int64_t)CatArrIndexToOffset<IndexType, Dims>::compute(
os.tensorSize, os.tensorStride, dimSize, concatDim, tid) * alignment; // in bytes
auto vec = at::native::memory::ld_vec<alignment>(data + (int64_t)alignment * tid);
at::native::memory::st_vec<alignment>(output + dataOffset + elementOffset, vec);
tid += stride;
}
}
/*
Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads
to improve memory bandwidth throughput.
@ -328,27 +296,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
scalar_t *data = (scalar_t *)(out.mutable_data_ptr());
CatArrInputTensorMetadata<scalar_t, unsigned int, batch_size, stride_size> catMetaData;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> outputParam;
// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.
constexpr bool isContig = stride_size == 1;
bool isAligned = true;
constexpr int alignment = 16;
// Next, let's initialize the size, stride arrays for the output Tensor.
// for contig case, we'll canonicalize output strides, so that
// we don't have arbitrary strides for dims of size 0
size_t stride0 = 1;
if (memory_format == c10::MemoryFormat::Contiguous) {
for (int i = nDims - 1; i >= 0; --i) {
for (int i = 0; i < nDims; ++i) {
outputParam.tensorSize[i] = out.size(i);
if (isContig) {
outputParam.tensorStride[i] = stride0;
stride0 *= out.size(i);
} else {
outputParam.tensorStride[i] = out.stride(i);
}
outputParam.tensorStride[i] = out.stride(i);
}
} else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) {
// permute the semantics of dims from NCHW to NHWC so that the input
@ -367,15 +320,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.
// for channels last computing slice size correctly is much more involved, so we never send it
// on the fully vectorized path
// we need output stride in cat dimension to be multiple of alignment,
// if we ever use it to compute offsets
// for catting in 0th dimension it doesn't matter
bool isInOutAligned = isContig && at::native::memory::get_alignment(data) >= alignment &&
memory_format == c10::MemoryFormat::Contiguous && (dimension == 0 ||
outputParam.tensorStride[dimension - 1] * sizeof(scalar_t) % alignment == 0);
bool isContig = true;
bool isAligned = true;
unsigned int max_elements_per_tensor = 0;
// Now we loop
@ -391,16 +341,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
// high-dimensional tensor
if (inputs[i+batchCounter].get().numel() > 0) {
dimSize = inputs[i+batchCounter].get().size(dimension);
if (isInOutAligned) {
auto t = inputs[i+batchCounter].get();
// similarly to output stride, we cannot trust stride value to
// determine slice size if the corresponding dimension is 1
// we have to multiply all the subsequent sizes
int64_t slice_size = dimension == 0 ? t.numel() : t.sizes()[dimension - 1] != 1 ?
t.strides()[dimension - 1] : c10::multiply_integers(t.sizes().begin() + dimension, t.sizes().end());
slice_size *= sizeof(scalar_t);
isInOutAligned &= (slice_size % alignment == 0);
}
}
catMetaData.input[batchCounter] = (scalar_t*)(inputs[i+batchCounter].get().const_data_ptr());
@ -411,12 +351,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
#ifdef USE_ROCM
// On ROCm, CatArrayBatchedCopy_contig is faster
isAligned = false;
isInOutAligned = false;
#else
// If at least one of the inputs is not aligned, we can't call the
// CatArrayBatchedCopy_alignedK_contig
isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]);
isInOutAligned &= at::native::memory::get_alignment(catMetaData.input[batchCounter]) >= alignment;
#endif
if (stride_size > 1) {
@ -427,6 +365,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j];
}
catMetaData.isContiguous[batchCounter] = false;
isContig = false;
} else {
catMetaData.isContiguous[batchCounter] = true;
}
@ -449,13 +388,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
max_elements_per_tensor, batchCounter);
#else
dim3 applyBlock, catGrid;
if (isInOutAligned) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, alignment>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) > 2) {
if (isContig && sizeof(scalar_t) > 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_16>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) == 2) {
} else if (isContig && sizeof(scalar_t) == 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_8>(
max_elements_per_tensor, batchCounter);
} else {
@ -463,30 +399,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
getCatGrid(batchCounter, catGrid);
}
#endif
int32_t trailingSize;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam;
if (isInOutAligned) {
// in this case we can and should flatten the tensors after the cat dim
// we want to view the tensors as if consisting of `alignment`-sized elements
// however, we might not be able to cleanly divide just the last dim -
// it might not be the multiple of alignment.
// however, we know that the full concatted slice is multiple of alignment,
// so if we flatten all the dims after and including concat dim,
// it will be divisible by alignment
// then we need to divide last out size by elems_per_vec,
// and divide all strides except last by elems_per_vec (last stride is 1 always)
// for input, we will fix up the sizes and strides in the kernel directly
kernelOutputParam = outputParam;
nDims = dimension + 1;
constexpr auto elems_per_vec = alignment / sizeof(scalar_t);
auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1];
kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec;
trailingSize = outputParam.tensorStride[dimension];
kernelOutputParam.tensorStride[dimension] = 1;
for (int i = 0; i < dimension; ++i) {
kernelOutputParam.tensorStride[i] /= elems_per_vec;
}
}
if (memory_format != c10::MemoryFormat::Contiguous) {
switch (dimension) {
@ -501,12 +413,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
}
// Template Declarations for dim = 1, 2, 3, 4
#define HANDLE_CASE(DIMS) \
if (isInOutAligned) {\
constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \
CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
(char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\
} else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\

View File

@ -221,9 +221,22 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_
std::optional<CuFFTConfig> uncached_plan;
const CuFFTConfig * config = nullptr;
// Workaround for gh-63152, gh-58724
// Bluestein plans in CUDA 11.1 (cufft 10.3) cannot be re-used
// Bluestein's algorithm is only used when a size has large prime factors,
// sizes with only small prime factors can still be cached
if (plan_cache.max_size() > 0) {
bool use_caching = true;
#ifdef CUFFT_VERSION
if constexpr (10300 <= CUFFT_VERSION && CUFFT_VERSION < 10400) {
// Only cache plans for transforms with small prime factors
use_caching = std::none_of(
signal_size.begin() + 1, signal_size.end(), [](int64_t dim_size) {
return has_large_prime_factor(dim_size);
});
}
#endif
if (use_caching && plan_cache.max_size() > 0) {
guard.lock();
if (plan_cache.max_size() > 0) { // check again after acquiring the lock
config = &plan_cache.lookup(Params);

View File
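An illustrative stand-in for the has_large_prime_factor() test that gates caching above, assuming the usual cuFFT radix set {2, 3, 5, 7}: strip those factors and treat any remainder as a large prime that would select Bluestein's algorithm.

    #include <cstdint>
    #include <cstdio>

    bool has_large_prime_factor(int64_t n) {
      const int64_t radices[] = {2, 3, 5, 7};  // assumed cuFFT small-radix set
      for (int64_t p : radices) {
        while (n % p == 0) {
          n /= p;
        }
      }
      return n != 1;  // anything left is divisible by a prime larger than 7
    }

    int main() {
      std::printf("%d %d\n", has_large_prime_factor(1024), has_large_prime_factor(1009));
      // 0 1: a power of two stays cacheable, the prime 1009 does not under this workaround
    }
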

@ -5,20 +5,12 @@
namespace at::native {
__global__ void weight_int8pack_mm_kernel(
const float* x,
const int8_t* w,
const float* scale,
float* out,
int B,
int K,
int N) {
__global__ void weight_int8pack_mm_kernel(const float* x, const int8_t* w, const float* scale, float* out, int B, int K, int N) {
// one thread per output element: [B, N]
int b = blockIdx.y * blockDim.y + threadIdx.y;
int n = blockIdx.x * blockDim.x + threadIdx.x;
if (b >= B || n >= N)
return;
if (b >= B || n >= N) return;
float acc = 0.0f;
for (int k = 0; k < K; ++k) {
@ -28,11 +20,7 @@ __global__ void weight_int8pack_mm_kernel(
out[b * N + n] = acc * scale[n];
}
void launch_weight_int8pack_mm_cuda_kernel(
const Tensor& x,
const Tensor& w_int8,
const Tensor& scale,
Tensor& out) {
void launch_weight_int8pack_mm_cuda_kernel(const Tensor& x, const Tensor& w_int8, const Tensor& scale, Tensor& out) {
const int B = x.size(0);
const int K = x.size(1);
const int N = w_int8.size(0);
@ -47,16 +35,12 @@ void launch_weight_int8pack_mm_cuda_kernel(
w_int8.data_ptr<int8_t>(),
scale.data_ptr<float>(),
out.data_ptr<float>(),
B,
K,
N);
B, K, N);
}
// Main GPU entry point
at::Tensor _weight_int8pack_mm_cuda(
const at::Tensor& x,
const at::Tensor& w_int8,
const at::Tensor& scale) {
at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int8, const at::Tensor& scale) {
// --- Check inputs ---
TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor");
TORCH_CHECK(w_int8.is_cuda(), "w must be a CUDA tensor");
@ -66,16 +50,12 @@ at::Tensor _weight_int8pack_mm_cuda(
TORCH_CHECK(w_int8.dim() == 2, "w must be 2D");
TORCH_CHECK(scale.dim() == 1, "scale must be 1D");
TORCH_CHECK(
x.size(1) == w_int8.size(1),
"K dimension mismatch: x.size(1) != w.size(1)");
TORCH_CHECK(
w_int8.size(0) == scale.size(0),
"Output dim mismatch: w.size(0) != scale.size(0)");
TORCH_CHECK(x.size(1) == w_int8.size(1), "K dimension mismatch: x.size(1) != w.size(1)");
TORCH_CHECK(w_int8.size(0) == scale.size(0), "Output dim mismatch: w.size(0) != scale.size(0)");
// --- Determine shapes ---
auto B = x.size(0); // batch size
auto N = w_int8.size(0); // output dim
auto B = x.size(0); // batch size
auto N = w_int8.size(0); // output dim
// Ensure inputs are in the correct types for the kernel
auto x_f32 = x.to(at::kFloat);
@ -83,13 +63,12 @@ at::Tensor _weight_int8pack_mm_cuda(
auto scale_f32 = scale.to(at::kFloat);
// --- Allocate output ---
auto out = at::empty({B, N}, x_f32.options());
auto out = at::empty({B, N}, x.options().dtype(at::kFloat));
// --- Launch kernel ---
launch_weight_int8pack_mm_cuda_kernel(
x_f32, w_int8_contiguous, scale_f32, out);
launch_weight_int8pack_mm_cuda_kernel(x_f32, w_int8_contiguous, scale_f32, out);
return out.to(x.dtype());
return out;
}
} // namespace at::native
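
A host-side reference (a sketch, not the CUDA kernel) of what weight_int8pack_mm computes: out[b][n] = scale[n] * sum_k x[b][k] * w[n][k], i.e. fp32 activations against a row-major int8 weight with one scale per output channel.

    #include <cstdint>
    #include <vector>

    std::vector<float> weight_int8pack_mm_ref(const std::vector<float>& x,      // B x K
                                              const std::vector<int8_t>& w,     // N x K
                                              const std::vector<float>& scale,  // N
                                              int B, int K, int N) {
      std::vector<float> out(static_cast<size_t>(B) * N, 0.f);
      for (int b = 0; b < B; ++b) {
        for (int n = 0; n < N; ++n) {
          float acc = 0.f;
          for (int k = 0; k < K; ++k) {
            acc += x[b * K + k] * static_cast<float>(w[n * K + k]);
          }
          out[b * N + n] = acc * scale[n];  // per-output-channel dequantization
        }
      }
      return out;
    }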

View File

@ -482,9 +482,7 @@ auto build_graph(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA")
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
@ -704,9 +702,7 @@ auto build_graph_nestedtensor(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA_NESTEDTENSOR")
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale)
.set_seq_len_q(SEQ_LEN_Q_)

View File

@ -2,7 +2,6 @@
#include <ATen/core/Tensor.h>
#include <ATen/TensorUtils.h>
#include <ATen/div_rtn.h>
#include <c10/util/safe_numerics.h>
namespace at::native {
@ -55,14 +54,6 @@ inline void col2im_shape_check(
int64_t batch_dim = (ndim == 3) ? 0 : -1;
int64_t n_input_plane = input.size(batch_dim + 1);
uint64_t prod_kernel_size = 1;
TORCH_CHECK(!c10::mul_overflows(static_cast<uint64_t>(kernel_width), static_cast<uint64_t>(kernel_height), &prod_kernel_size),
"Given kernel_width = ",
kernel_width,
" and kernel_height = ",
kernel_height,
" the product of kernel_width and kernel_height overflowed.");
if (n_input_plane % (kernel_width * kernel_height) != 0) {
TORCH_CHECK(false,

View File

@ -1770,12 +1770,10 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> miopen_depthwise_convolution_back
// fusions
// ---------------------------------------------------------------------
void raw_miopen_convolution_add_relu_out(
void raw_miopen_convolution_relu_out(
const Tensor& output,
const Tensor& input,
const Tensor& weight,
const Tensor& z,
float alpha,
const Tensor& bias,
IntArrayRef stride,
IntArrayRef padding,
@ -1783,20 +1781,68 @@ void raw_miopen_convolution_add_relu_out(
int64_t groups,
bool benchmark,
bool deterministic) {
raw_miopen_convolution_forward_out(
output,
auto dataType = getMiopenDataType(input);
miopenConvolutionMode_t c_mode = miopenConvolution;
ConvolutionArgs args{ input, output, weight };
args.handle = getMiopenHandle();
at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight);
setConvolutionParams(
&args.params,
args.handle,
input,
weight,
padding,
stride,
dilation,
groups,
deterministic,
memory_format);
args.idesc.set(input, memory_format);
args.wdesc.set(weight, memory_format, 0);
args.odesc.set(output, memory_format);
args.cdesc.set(
dataType,
c_mode,
input.dim() - 2,
args.params.padding,
args.params.stride,
args.params.dilation,
args.params.groups,
benchmark,
deterministic);
at::Tensor alpha_mul_z_add_bias =
at::native::reshape_bias(input.dim(), bias).add(z, alpha);
output.add_(alpha_mul_z_add_bias);
output.relu_();
TensorDescriptor bdesc;
bdesc.set(bias.expand({1, bias.size(0)}), output.dim());
// Create the fusion plan
miopenFusionPlanDescriptor_t fusePlanDesc;
miopenFusionOpDescriptor_t convoOp;
miopenFusionOpDescriptor_t biasOp;
miopenFusionOpDescriptor_t activOp;
MIOPEN_CHECK(miopenCreateFusionPlan(&fusePlanDesc, miopenVerticalFusion, args.idesc.desc()));
MIOPEN_CHECK(miopenCreateOpConvForward(fusePlanDesc, &convoOp, args.cdesc.desc(), args.wdesc.desc()));
MIOPEN_CHECK(miopenCreateOpBiasForward(fusePlanDesc, &biasOp, bdesc.desc()));
MIOPEN_CHECK(miopenCreateOpActivationForward(fusePlanDesc, &activOp, miopenActivationRELU));
// compile fusion plan
MIOPEN_CHECK(miopenCompileFusionPlan(args.handle, fusePlanDesc));
// Set the Args
float alpha = static_cast<float>(1);
float beta = static_cast<float>(0);
float activ_alpha = static_cast<float>(0);
float activ_beta = static_cast<float>(0);
float activ_gamma = static_cast<float>(0);
miopenOperatorArgs_t fusionArgs;
MIOPEN_CHECK(miopenCreateOperatorArgs(&fusionArgs));
MIOPEN_CHECK(miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weight.const_data_ptr()));
MIOPEN_CHECK(miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.const_data_ptr()));
MIOPEN_CHECK(miopenSetOpArgsActivForward(fusionArgs, activOp, &alpha, &beta, activ_alpha, activ_beta, activ_gamma));
miopenExecuteFusionPlan(args.handle, fusePlanDesc, args.idesc.desc(), input.const_data_ptr(), args.odesc.desc(), output.data_ptr(), fusionArgs);
// Cleanup
miopenDestroyFusionPlan(fusePlanDesc);
}
static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat memory_format) {
@ -1809,107 +1855,171 @@ static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat m
Tensor miopen_convolution_add_relu(
const Tensor& input_t,
const Tensor& weight_t,
const Tensor& z_t,
const Tensor& z,
const std::optional<Scalar>& alpha,
const std::optional<Tensor>& bias_t,
const std::optional<Tensor>& bias,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
int64_t groups) {
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
const Tensor input = input_t.contiguous(memory_format);
const Tensor weight = weight_t.contiguous(memory_format);
Tensor z = z_t;
if (z.suggest_memory_format() != memory_format) {
z = z.to(memory_format);
}
z = z.contiguous(memory_format);
// FuseFrozenConvAddRelu performs some tensor shape checking
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input.sizes(), weight.sizes(), padding, stride, dilation),
input.options().memory_format(memory_format));
if (output_t.numel() == 0) {
return output_t;
}
// MIOpen does not support fusion of add, the alpha2 * z step of the below cuDNN function:
// y = act ( alpha1 * conv(x) + alpha2 * z + bias )
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
auto& ctx = at::globalContext();
bool benchmark = ctx.benchmarkCuDNN();
auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
auto _bias = bias_t.has_value()
? bias_t.value()
: at::zeros(
{output_t.size(1)},
optTypeMetaToScalarType(output_t.options().dtype_opt()),
output_t.options().layout_opt(),
output_t.options().device_opt(),
output_t.options().pinned_memory_opt());
raw_miopen_convolution_add_relu_out(
output_t,
TensorArg input { input_t, "input", 1 },
weight { weight_t, "weight", 2 };
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
input_t.options().memory_format(memory_format));
if (output_t.numel() == 0){
return output_t;
}
// Avoid ambiguity of "output" when this is being used as backwards
TensorArg output{output_t, "result", 0};
miopen_convolution_forward_out(
output,
"miopen_convolution_add_relu",
input,
weight,
z,
_alpha,
_bias,
stride,
padding,
stride,
dilation,
groups,
benchmark,
true); // deterministic
false // deterministic
);
return output_t;
auto contig_output_t = self_or_new_memory_format(output_t, memory_format);
if (!output_t.is_same(contig_output_t)) {
contig_output_t.copy_(output_t);
}
auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
auto _bias = bias.has_value()
? bias.value()
: at::zeros(
{contig_output_t.size(1)},
optTypeMetaToScalarType(contig_output_t.options().dtype_opt()),
contig_output_t.options().layout_opt(),
contig_output_t.options().device_opt(),
contig_output_t.options().pinned_memory_opt());
at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input_t.dim(), _bias).add(z, _alpha);
contig_output_t.add_(alpha_mul_z_add_bias);
contig_output_t.relu_();
return contig_output_t;
}
Tensor miopen_convolution_relu(
const Tensor& input_t,
const Tensor& weight_t,
const std::optional<Tensor>& bias_t,
const std::optional<Tensor>& bias,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
int64_t groups) {
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
const Tensor input = input_t.contiguous(memory_format);
const Tensor weight = weight_t.contiguous(memory_format);
// FuseFrozenConvAddRelu performs some tensor shape checking
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input.sizes(), weight.sizes(), padding, stride, dilation),
input.options().memory_format(memory_format));
if (output_t.numel() == 0) {
return output_t;
}
auto& ctx = at::globalContext();
bool benchmark = ctx.benchmarkCuDNN();
auto _bias = bias_t.has_value()
? bias_t.value()
: at::zeros(
{output_t.size(1)},
optTypeMetaToScalarType(output_t.options().dtype_opt()),
output_t.options().layout_opt(),
output_t.options().device_opt(),
output_t.options().pinned_memory_opt());
raw_miopen_convolution_add_relu_out(
output_t,
input,
weight,
output_t, // use output_t as z to satisfy MIOpen API
0, // alpha
_bias,
stride,
padding,
dilation,
groups,
benchmark, // benchmark
true); // deterministic
// MIOpen currently only supports MemoryFormat::Contiguous and fp32 and 2d
if (input_t.suggest_memory_format() == at::MemoryFormat::Contiguous
&& input_t.scalar_type() == at::kFloat
&& input_t.ndimension() == 4) {
return output_t;
// FuseFrozenConvAddRelu performs some tensor shape checking
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
input_t.options().memory_format(input_t.suggest_memory_format()));
if (output_t.numel() == 0) {
return output_t;
}
auto _bias = bias.has_value()
? bias.value()
: at::zeros(
{output_t.size(1)},
optTypeMetaToScalarType(output_t.options().dtype_opt()),
output_t.options().layout_opt(),
output_t.options().device_opt(),
output_t.options().pinned_memory_opt());
raw_miopen_convolution_relu_out(
output_t,
input_t,
weight_t,
_bias,
stride,
padding,
dilation,
groups,
benchmark, // benchmark
false // deterministic
);
return output_t;
}
else {
// fallback
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
TensorArg input { input_t, "input", 1 },
weight { weight_t, "weight", 2 };
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
input->options().memory_format(memory_format));
if (output_t.numel() == 0){
return output_t;
}
// Avoid ambiguity of "output" when this is being used as backwards
TensorArg output{output_t, "result", 0};
miopen_convolution_forward_out(
output,
"miopen_convolution_relu",
input,
weight,
padding,
stride,
dilation,
groups,
benchmark,
false // deterministic
);
auto contig_output_t = self_or_new_memory_format(output_t, memory_format);
if (!output_t.is_same(contig_output_t)) {
contig_output_t.copy_(output_t);
}
auto _bias = bias.has_value()
? bias.value()
: at::zeros(
{contig_output_t.size(1)},
optTypeMetaToScalarType(contig_output_t.options().dtype_opt()),
contig_output_t.options().layout_opt(),
contig_output_t.options().device_opt(),
contig_output_t.options().pinned_memory_opt());
at::Tensor reshaped_bias = at::native::reshape_bias(input_t.dim(), _bias);
contig_output_t.add_(reshaped_bias);
contig_output_t.relu_();
return contig_output_t;
}
}
REGISTER_CUDA_DISPATCH(miopen_convolution_backward_stub, &miopen_convolution_backward)
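
An elementwise reference (a sketch with made-up NCHW bookkeeping) of the epilogue both code paths above ultimately produce: y = relu(conv(x) + alpha * z + bias), with the bias broadcast over channels; MIOpen's fusion plan covers the conv+bias+relu part, so the alpha * z add has to be applied separately.

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    void add_relu_epilogue(std::vector<float>& conv_out,    // N*C*H*W, NCHW layout
                           const std::vector<float>& z,     // same shape as conv_out
                           const std::vector<float>& bias,  // one value per channel
                           float alpha, int N, int C, int HW) {
      for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
          for (int i = 0; i < HW; ++i) {
            const std::size_t idx = (static_cast<std::size_t>(n) * C + c) * HW + i;
            conv_out[idx] = std::max(0.0f, conv_out[idx] + alpha * z[idx] + bias[c]);
          }
        }
      }
    }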

View File

@ -559,60 +559,4 @@ Tensor _int_mm_xpu(const Tensor& self, const Tensor& mat2) {
at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt));
return _int_mm_out_xpu(self, mat2, result);
}
Tensor _weight_int8pack_mm_xpu(
const Tensor& A,
const Tensor& B,
const Tensor& scales) {
auto M = A.size(0);
auto N = B.size(0);
auto K = A.size(1);
TORCH_CHECK(
A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat,
" : expect A to be either 32-bit or 16-bit float tensor.");
TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor.");
TORCH_CHECK(
A.stride(1) == 1, " : A must be contiguous on the last dimension.");
TORCH_CHECK(B.dtype() == kChar, " : expect B to be int8 tensor.");
TORCH_CHECK(B.is_contiguous(), " : expect B to be contiguous.");
TORCH_CHECK(B.size(1) == K, " : expect B.size(1) == ", K);
TORCH_CHECK(
scales.dim() == 1 && scales.size(0) == N,
" : expect scales to be 1d tensor with size ",
N);
auto C = at::empty({M, N}, A.options());
// --- Launch kernel ---
Tensor bias = at::Tensor();
Tensor mat2_zero_points = at::Tensor();
Tensor non_const_scales = scales;
auto post_op_args = torch::List<std::optional<at::Scalar>>();
at::native::onednn::quantized_matmul(
A.contiguous(),
1.0,
0,
B,
non_const_scales,
mat2_zero_points,
bias,
C,
1.0,
0,
C.scalar_type(),
/*other*/ std::nullopt,
/*other scale*/ 1.0,
/*other zp*/ 0,
/*binary post op*/ "none",
/*binary alpha*/ 1.0,
/*post_op_name*/ "none",
post_op_args,
/*post_op_algorithm*/ "none",
/*m2_trans*/ false);
return C;
}
} // namespace at::native

View File

@ -110,9 +110,8 @@ void quantized_matmul(
// [Note] Quantized Matrix Multiplication at XPU
// The following code integrates oneDNN quantized gemm. The quantization
// config we support:
// activation: s8, u8, fp16, bf16, fp32; per tensor calibrated;
// symmetric&asymmetric weight: s8; per_tensor/per_channel calibrated;
// symmetric
// activation: s8&u8; per tensor calibrated; symmetric&asymmetric
// weight: s8; per_tensor/per_channel calibrated; symmetric
auto attr = Attr(static_cast<float>(1.0 / output_scale), output_zero_point);
construct_attr_by_post_op(
binary_post_op,

View File

@ -0,0 +1,48 @@
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))
typedef NS_ENUM(NSUInteger, MPSGraphFFTScalingMode) {
MPSGraphFFTScalingModeNone = 0L,
MPSGraphFFTScalingModeSize = 1L,
MPSGraphFFTScalingModeUnitary = 2L,
};
@interface FakeMPSGraphFFTDescriptor : NSObject<NSCopying>
@property(readwrite, nonatomic) BOOL inverse;
@property(readwrite, nonatomic) MPSGraphFFTScalingMode scalingMode;
@property(readwrite, nonatomic) BOOL roundToOddHermitean;
+ (nullable instancetype)descriptor;
@end
@compatibility_alias MPSGraphFFTDescriptor FakeMPSGraphFFTDescriptor;
@interface MPSGraph (SonomaOps)
- (MPSGraphTensor* _Nonnull)conjugateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)realPartOfTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)fastFourierTransformWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)realToHermiteanFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)HermiteanToRealFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
@end
// define BFloat16 enums for MacOS13
#define MPSDataTypeBFloat16 ((MPSDataType)(MPSDataTypeAlternateEncodingBit | MPSDataTypeFloat16))
// define Metal version
#define MTLLanguageVersion3_1 ((MTLLanguageVersion)((3 << 16) + 1))
#endif

View File

@ -0,0 +1,196 @@
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
// TODO: Remove me when moved to MacOS 13
#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))
@interface FakeMPSGraphConvolution3DOpDescriptor : NSObject<NSCopying>
@property(readwrite, nonatomic) NSUInteger strideInX;
@property(readwrite, nonatomic) NSUInteger strideInY;
@property(readwrite, nonatomic) NSUInteger strideInZ;
@property(readwrite, nonatomic) NSUInteger dilationRateInX;
@property(readwrite, nonatomic) NSUInteger dilationRateInY;
@property(readwrite, nonatomic) NSUInteger dilationRateInZ;
@property(readwrite, nonatomic) NSUInteger paddingLeft;
@property(readwrite, nonatomic) NSUInteger paddingRight;
@property(readwrite, nonatomic) NSUInteger paddingTop;
@property(readwrite, nonatomic) NSUInteger paddingBottom;
@property(readwrite, nonatomic) NSUInteger paddingFront;
@property(readwrite, nonatomic) NSUInteger paddingBack;
@property(readwrite, nonatomic) MPSGraphPaddingStyle paddingStyle;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout dataLayout;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout weightsLayout;
@property(readwrite, nonatomic) NSUInteger groups;
@end
@compatibility_alias MPSGraphConvolution3DOpDescriptor FakeMPSGraphConvolution3DOpDescriptor;
#endif
@interface MPSGraph (VenturaOps)
#if !defined(__MAC_13_0) && (!defined(MAC_OS_X_VERSION_13_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_0))
typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode) {
MPSGraphResizeNearestRoundingModeRoundPreferCeil = 0L,
MPSGraphResizeNearestRoundingModeRoundPreferFloor = 1L,
MPSGraphResizeNearestRoundingModeCeil = 2L,
MPSGraphResizeNearestRoundingModeFloor = 3L,
MPSGraphResizeNearestRoundingModeRoundToEven = 4L,
MPSGraphResizeNearestRoundingModeRoundToOdd = 5L,
};
// Define complex enums for MacOS 12
#define MPSDataTypeComplexBit 0x01000000
#define MPSDataTypeComplexFloat32 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 64))
#define MPSDataTypeComplexFloat16 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 32))
#endif
- (MPSGraphTensor* _Nonnull)convolution3DWithSourceTensor:(MPSGraphTensor* _Nonnull)source
weightsTensor:(MPSGraphTensor* _Nonnull)weights
descriptor:(MPSGraphConvolution3DOpDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)
convolution3DDataGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
weightsTensor:(MPSGraphTensor* _Nonnull)weights
outputShape:(MPSShape* _Nonnull)outputShape
forwardConvolutionDescriptor:
(MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)
convolution3DWeightsGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
sourceTensor:(MPSGraphTensor* _Nonnull)source
outputShape:(MPSShape* _Nonnull)outputShape
forwardConvolutionDescriptor:
(MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)cumulativeSumWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)inverseOfTensor:(MPSGraphTensor* _Nonnull)inputTensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
layout:(MPSGraphTensorNamedDataLayout)layout
normalizeCoordinates:(BOOL)normalizeCoordinates
relativeCoordinates:(BOOL)relativeCoordinates
alignCorners:(BOOL)alignCorners
paddingMode:(MPSGraphPaddingMode)paddingMode
samplingMode:(MPSGraphResizeMode)samplingMode
constantValue:(double)constantValue
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
layout:(MPSGraphTensorNamedDataLayout)layout
normalizeCoordinates:(BOOL)normalizeCoordinates
relativeCoordinates:(BOOL)relativeCoordinates
alignCorners:(BOOL)alignCorners
paddingMode:(MPSGraphPaddingMode)paddingMode
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
constantValue:(double)constantValue
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)truncateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
@end
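The category above appears to forward-declare these selectors so the MPS backend can compile against older SDK headers; the actual implementations are provided by MetalPerformanceShadersGraph at runtime. A minimal, hypothetical usage sketch (graph and tensor names are illustrative, not from this diff) might look like:
#import <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
// Hypothetical sketch, not part of this diff: invoking one of the declared selectors.
MPSGraph* graph = [MPSGraph new];
MPSGraphTensor* input = [graph placeholderWithShape:@[ @4, @8 ]
                                           dataType:MPSDataTypeFloat32
                                               name:nil];
// Indices that would sort `input` along axis 1 in descending order.
MPSGraphTensor* sortedIdx = [graph argSortWithTensor:input
                                                axis:1
                                          descending:YES
                                                name:nil];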

View File

@ -9,6 +9,8 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>
#include <fmt/ranges.h>
@ -568,7 +570,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
MPSShape* mpsStrides = getMPSShape(_tensor.strides());
check_mps_shape(mpsShape);
auto storage_numel = src.storage().nbytes() / src.element_size() - src.storage_offset();
auto storage_numel = src.storage().nbytes() / src.element_size();
TORCH_CHECK(storage_numel <= std::numeric_limits<int32_t>::max(),
"MPSGaph does not support tensor dims larger than INT_MAX");
MPSNDArrayDescriptor* srcTensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:dataType

View File

@ -1,25 +0,0 @@
#pragma once
#include <c10/metal/common.h>
#ifdef __METAL__
enum class EmbeddingBagMode { SUM = 0, MEAN, MAX };
#else
#include <ATen/native/EmbeddingBag.h>
using at::native::EmbeddingBagMode;
#endif
template <typename idx_type_t = uint32_t>
struct EmbeddingBagParams {
::c10::metal::array<idx_type_t, 2> weight_strides;
::c10::metal::array<idx_type_t, 2> output_strides;
::c10::metal::array<idx_type_t, 2> max_indices_strides;
idx_type_t per_sample_weights_strides;
idx_type_t num_indices;
idx_type_t num_bags;
idx_type_t feature_size;
EmbeddingBagMode mode;
int64_t padding_idx;
};

View File

@ -1,212 +0,0 @@
#include <ATen/native/mps/kernels/EmbeddingBag.h>
#include <c10/metal/utils.h>
#include <metal_array>
#include <metal_stdlib>
using namespace metal;
using namespace c10::metal;
template <EmbeddingBagMode M, typename T>
struct ReductionOpInit {
inline opmath_t<T> operator()() {
return 0;
}
};
template <typename T>
struct ReductionOpInit<EmbeddingBagMode::MAX, T> {
inline opmath_t<T> operator()() {
return static_cast<opmath_t<T>>(-INFINITY);
}
};
template <EmbeddingBagMode M, typename T>
struct ReductionOp {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_strides);
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::SUM, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_strides) {
if (per_sample_weights_strides) {
T per_sample_weight = per_sample_weights
[per_sample_weights_strides * per_sample_weights_index];
return static_cast<opmath_t<T>>(per_sample_weight) *
static_cast<opmath_t<T>>(weight_val) +
out_val;
} else {
return static_cast<opmath_t<T>>(weight_val) + out_val;
}
}
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::MEAN, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t,
constant T*,
uint32_t) {
return static_cast<opmath_t<T>>(weight_val) + out_val;
}
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::MAX, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t,
constant T*,
uint32_t) {
return max(static_cast<opmath_t<T>>(weight_val), out_val);
}
};
template <EmbeddingBagMode M, typename T>
struct ReductionOpFinal {
inline T operator()(opmath_t<T> val, uint32_t) {
return static_cast<T>(val);
}
};
template <typename T>
struct ReductionOpFinal<EmbeddingBagMode::MEAN, T> {
inline T operator()(opmath_t<T> val, uint32_t count) {
auto out = val / count;
return static_cast<T>((count == 0) ? 0 : out);
}
};
template <typename T>
struct ReductionOpFinal<EmbeddingBagMode::MAX, T> {
inline T operator()(opmath_t<T> val, uint32_t count) {
return static_cast<T>((count == 0) ? 0 : val);
}
};
template <EmbeddingBagMode M, typename T, typename I>
void embedding_bag_impl(
constant T* weight,
constant I* indices,
constant I* offsets,
constant T* per_sample_weights,
device T* output,
device I* offset2bag,
device I* bag_size,
device I* max_indices,
constant EmbeddingBagParams<uint32_t>& params,
uint tid) {
auto num_indices = params.num_indices;
auto num_bags = params.num_bags;
auto feature_size = params.feature_size;
auto padding_idx = params.padding_idx;
auto per_sample_weights_strides = params.per_sample_weights_strides;
constant auto& output_strides = params.output_strides;
constant auto& weight_strides = params.weight_strides;
constant auto& max_indices_strides = params.max_indices_strides;
auto bag_idx = tid / feature_size;
auto feature_idx = tid % feature_size;
output += bag_idx * output_strides[0] + feature_idx * output_strides[1];
uint32_t offsets_end = min(bag_idx + 1, num_bags - 1);
bool is_last_bag = bag_idx + 1 == num_bags;
uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]);
uint32_t indices_end = is_last_bag * (num_indices) +
(!is_last_bag) * (static_cast<uint32_t>(offsets[offsets_end]));
auto out_val = ReductionOpInit<M, T>()();
uint32_t bag_size_ = 0;
for (uint32_t indices_idx = indices_start; indices_idx < indices_end;
indices_idx++) {
I weight_idx = indices[indices_idx];
bool pad = (weight_idx == padding_idx);
T weight_val = weight
[static_cast<uint32_t>(weight_idx) * weight_strides[0] +
feature_idx * weight_strides[1]];
bag_size_ += static_cast<uint32_t>(!pad);
auto tmp_val = ReductionOp<M, T>()(
weight_val,
out_val,
indices_idx,
per_sample_weights,
per_sample_weights_strides);
out_val = pad ? out_val : tmp_val;
}
*output = ReductionOpFinal<M, T>()(out_val, bag_size_);
}
#define DISPATCH_IMPL(MODE) \
return embedding_bag_impl<MODE>( \
weight, \
indices, \
offsets, \
per_sample_weights, \
output, \
offset2bag, \
bag_size, \
max_indices, \
params, \
tid)
template <typename T, typename I>
kernel void embedding_bag(
constant T* weight [[buffer(0)]],
constant I* indices [[buffer(1)]],
constant I* offsets [[buffer(2)]],
constant T* per_sample_weights [[buffer(3)]],
device T* output [[buffer(4)]],
device I* offset2bag [[buffer(5)]],
device I* bag_size [[buffer(6)]],
device I* max_indices [[buffer(7)]],
constant EmbeddingBagParams<uint32_t>& params [[buffer(8)]],
uint tid [[thread_position_in_grid]]) {
switch (params.mode) {
case EmbeddingBagMode::SUM:
DISPATCH_IMPL(EmbeddingBagMode::SUM);
case EmbeddingBagMode::MEAN:
DISPATCH_IMPL(EmbeddingBagMode::MEAN);
case EmbeddingBagMode::MAX:
DISPATCH_IMPL(EmbeddingBagMode::MAX);
}
}
#define REGISTER_EMBEDDING_BAG_OP(T, I) \
template [[host_name("embedding_bag_" #T "_" #I)]] \
kernel void embedding_bag<T, I>( \
constant T * weight [[buffer(0)]], \
constant I * indices [[buffer(1)]], \
constant I * offsets [[buffer(2)]], \
constant T * per_sample_weights [[buffer(3)]], \
device T * output [[buffer(4)]], \
device I * offset2bag [[buffer(5)]], \
device I * bag_size [[buffer(6)]], \
device I * max_indices [[buffer(7)]], \
constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \
uint tid [[thread_position_in_grid]]);
REGISTER_EMBEDDING_BAG_OP(float, int);
REGISTER_EMBEDDING_BAG_OP(float, long);
REGISTER_EMBEDDING_BAG_OP(half, int);
REGISTER_EMBEDDING_BAG_OP(half, long);
REGISTER_EMBEDDING_BAG_OP(bfloat, int);
REGISTER_EMBEDDING_BAG_OP(bfloat, long);
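As a rough reference for what the kernel above computes, here is a hedged plain C++ sketch of the per-(bag, feature) reduction for EmbeddingBagMode::MEAN, using hypothetical names and ignoring per-sample weights:
// Hypothetical reference sketch, not part of this diff: MEAN reduction for one
// (bag, feature) pair, mirroring embedding_bag_impl above on the CPU.
#include <cstdint>
#include <vector>

float mean_bag(const std::vector<float>& weight,     // [num_rows * feature_size], row-major
               const std::vector<int64_t>& indices,
               uint32_t indices_start,
               uint32_t indices_end,
               uint32_t feature_idx,
               uint32_t feature_size,
               int64_t padding_idx) {
  float acc = 0.0f;
  uint32_t bag_size = 0;
  for (uint32_t i = indices_start; i < indices_end; ++i) {
    if (indices[i] == padding_idx) continue;          // padded entries are skipped
    acc += weight[static_cast<uint32_t>(indices[i]) * feature_size + feature_idx];
    ++bag_size;                                       // only non-pad entries count
  }
  return bag_size == 0 ? 0.0f : acc / bag_size;       // ReductionOpFinal<MEAN>
}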

View File

@ -8,6 +8,8 @@
#include <ATen/native/TensorIterator.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/operations/BinaryKernel.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <fmt/format.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,12 +1,23 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/ConvUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_mps_convolution_native.h>
#include <ATen/ops/_mps_convolution_transpose_native.h>
#include <ATen/ops/mps_convolution_backward_native.h>
#include <ATen/ops/mps_convolution_transpose_backward_native.h>
#include <fmt/format.h>
#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))
@implementation FakeMPSGraphConvolution3DOpDescriptor
- (nonnull id)copyWithZone:(nullable NSZone*)zone {
return self;
}
@end
#endif
namespace at::native {
@ -39,9 +50,11 @@ static void fill_conv3d_desc(MPSGraphConvolution3DOpDescriptor* descriptor_,
descriptor_.paddingFront = paddingDepth;
descriptor_.paddingBack = paddingDepth;
descriptor_.dataLayout = MPSGraphTensorNamedDataLayoutNCDHW;
// PyTorch always uses NCDHW memory layout for 3D tensors
descriptor_.dataLayout = (MPSGraphTensorNamedDataLayout)7L; // MPSGraphTensorNamedDataLayoutNCDHW;
descriptor_.weightsLayout = MPSGraphTensorNamedDataLayoutOIDHW;
// PyTorch always uses OIDHW memory layout for 3D weights
descriptor_.weightsLayout = (MPSGraphTensorNamedDataLayout)9L; // MPSGraphTensorNamedDataLayoutOIDHW;
descriptor_.groups = groups; // not yet tested in Xcode/C++
}
@ -173,6 +186,18 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
if (bias_defined)
bias_shape = bias_opt.value().sizes();
std::string mem_format_key;
switch (memory_format) {
case at::MemoryFormat::Contiguous:
mem_format_key = "Contiguous";
break;
case at::MemoryFormat::ChannelsLast:
mem_format_key = "ChannelsLast";
break;
default:
assert(0 && "Check should have been done earlier\n");
}
std::string bias_shape_key;
if (bias_defined) {
bias_shape_key = std::to_string(bias_shape[0]);
@ -180,16 +205,20 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
bias_shape_key = "nobias";
}
std::string key = fmt::format("mps_{}convolution:{}:{}:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
mps::getTensorsStringKey({input_t, weight_t}),
bias_defined,
bias_shape_key);
std::string key;
if (is3DConv) {
key = "mps_3d_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
} else {
key = "mps_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
}
MPSShape* inputShape = mps::getMPSShape(input_t, memory_format);
MPSShape* outputShape = mps::getMPSShape(output_t, memory_format);
@ -198,7 +227,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
if (input_t.is_contiguous(memory_format) && output_t.is_contiguous(memory_format) && is_macOS_15_0_or_newer) {
inputNDArray = getMPSNDArray(input_t, inputShape);
outputNDArray = getMPSNDArray(output_t, outputShape);
outputNDArray = getMPSNDArray(*output, outputShape);
}
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
@ -302,7 +331,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
}
}
auto outputPlaceholder = outputNDArray ? Placeholder(cachedGraph->outputTensor_, outputNDArray)
: Placeholder(cachedGraph->outputTensor_, output_t);
: Placeholder(cachedGraph->outputTensor_, *output);
NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =
[[[NSMutableDictionary alloc] initWithCapacity:3] autorelease];
@ -315,7 +344,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
}
return output_t;
return *output;
}
Tensor _mps_convolution(const Tensor& input_t,
@ -371,15 +400,33 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
@autoreleasepool {
MPSStream* stream = getCurrentMPSStream();
std::string mem_format_key;
switch (memory_format) {
case at::MemoryFormat::Contiguous:
mem_format_key = "Contiguous";
break;
case at::MemoryFormat::ChannelsLast:
mem_format_key = "ChannelsLast";
break;
default:
assert(0 && "Check should have been done earlier\n");
}
MPSShape* mps_input_shape = getMPSShape(input_size);
std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
getTensorsStringKey({grad_output_t, weight_t}));
std::string key;
if (is3DConv) {
key = "mps_3d_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
":" + std::to_string(stride[2]) + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
getTensorsStringKey({grad_output_t, weight_t});
} else {
key = "mps_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
getTensorsStringKey({grad_output_t, weight_t});
}
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t);
auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
@ -504,13 +551,19 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
MPSStream* stream = getCurrentMPSStream();
MPSShape* mps_weight_shape = getMPSShape(weight_size);
std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
getTensorsStringKey({grad_output_t, input_t, grad_weight_t}));
std::string key;
if (is3DConv) {
key = "mps_3d_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" +
getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
} else {
key = "mps_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" +
getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
}
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSShape* inputShape = getMPSShape(input_t);
bool isDepthwiseConv =

View File

@ -2,6 +2,7 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_copy_from_and_resize_native.h>
#include <ATen/ops/_copy_from_native.h>

View File

@ -5,6 +5,8 @@
#include <ATen/native/DistributionTemplates.h>
#include <ATen/native/Distributions.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,179 +0,0 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/TensorUtils.h>
#include <ATen/core/Tensor.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/EmbeddingBag.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/EmbeddingBag.h>
#include <fmt/format.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_embedding_bag_forward_only_native.h>
#include <ATen/ops/_embedding_bag_native.h>
#include <ATen/ops/empty.h>
#endif
namespace at::native {
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/EmbeddingBag_metallib.h>
#endif
namespace {
std::pair<Tensor, Tensor> promoteIndicesAndOffsets(const Tensor& indices, const Tensor& offsets) {
const auto commonType = promoteTypes(offsets.scalar_type(), indices.scalar_type());
return {indices.scalar_type() == commonType ? indices : indices.toType(commonType),
offsets.scalar_type() == commonType ? offsets : offsets.toType(commonType)};
}
} // namespace
namespace mps {
static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
const Tensor& weight,
const Tensor& indices_,
const Tensor& offsets_,
const bool scale_grad_by_freq,
const int64_t mode,
bool sparse,
const std::optional<Tensor>& per_sample_weights_opt,
bool include_last_offset,
int64_t padding_idx) {
TORCH_CHECK(indices_.dim() == 1, "input has to be a 1D Tensor, but got Tensor of dimension ", indices_.dim());
if (indices_.dim() == 1) {
TORCH_CHECK(offsets_.dim() == 1, "offsets has to be a 1D Tensor, but got Tensor of dimension ", offsets_.dim());
}
TORCH_CHECK(weight.dim() == 2, "weight has to be a 2D Tensor, but got Tensor of dimension ", weight.dim());
Tensor indices, offsets;
std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_);
auto indices_arg = TensorArg(indices, "indices", 1);
checkScalarTypes("embedding_bag_mps", indices_arg, {kLong, kInt});
auto offsets_arg = TensorArg(offsets, "offsets", 1);
checkScalarTypes("embedding_bag_mps", offsets_arg, {kLong, kInt});
checkSameType("embedding_bag_mps", indices_arg, offsets_arg);
auto weight_arg = TensorArg(weight, "weight", 1);
int64_t num_indices = indices.size(0);
int64_t num_bags = offsets.size(0);
if (include_last_offset) {
num_bags -= 1;
}
int64_t feature_size = weight.size(1);
auto bag_size = at::empty(offsets.sizes(), indices.options());
auto offset2bag = at::empty({indices.size(0)}, indices.options());
auto output = at::empty({num_bags, feature_size}, weight.options());
Tensor max_indices;
if (mode == EmbeddingBagMode::MAX) {
max_indices = at::empty({num_bags, feature_size}, indices.options());
} else {
max_indices = at::empty({0}, indices.options());
}
EmbeddingBagParams<uint32_t> params;
for (const auto dim : c10::irange(weight.dim())) {
params.weight_strides[dim] = safe_downcast<uint32_t, int64_t>(weight.stride(dim));
params.output_strides[dim] = safe_downcast<uint32_t, int64_t>(output.stride(dim));
if (mode == EmbeddingBagMode::MAX) {
params.max_indices_strides[dim] = safe_downcast<uint32_t, int64_t>(max_indices.stride(dim));
}
}
bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
params.per_sample_weights_strides = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
params.num_indices = num_indices;
params.num_bags = num_bags;
params.feature_size = feature_size;
params.mode = static_cast<EmbeddingBagMode>(mode);
params.padding_idx = padding_idx;
auto num_threads = output.numel();
MPSStream* stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
auto pipeline_state = lib.getPipelineStateForFunc(
fmt::format("embedding_bag_{}_{}", scalarToMetalTypeString(weight), scalarToMetalTypeString(indices)));
getMPSProfiler().beginProfileKernel(pipeline_state, "embedding_bag", {weight, indices, offsets});
[computeEncoder setComputePipelineState:pipeline_state];
mtl_setArgs(computeEncoder,
weight,
indices,
offsets,
use_per_sample_weights ? per_sample_weights_opt : std::nullopt,
output,
offset2bag,
bag_size,
max_indices,
params);
mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
getMPSProfiler().endProfileKernel(pipeline_state);
}
});
return std::tuple<Tensor, Tensor, Tensor, Tensor>(
std::move(output), std::move(offset2bag), std::move(bag_size), std::move(max_indices));
}
} // namespace mps
std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps(const Tensor& weight,
const Tensor& indices,
const Tensor& offsets,
const bool scale_grad_by_freq,
const int64_t mode,
bool sparse,
const std::optional<Tensor>& per_sample_weights_opt,
bool include_last_offset,
int64_t padding_idx) {
return mps::_embedding_bag_mps_impl(weight,
indices,
offsets,
scale_grad_by_freq,
mode,
sparse,
per_sample_weights_opt,
include_last_offset,
padding_idx);
}
std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_forward_only_mps(
const Tensor& weight,
const Tensor& indices,
const Tensor& offsets,
const bool scale_grad_by_freq,
const int64_t mode,
bool sparse,
const std::optional<Tensor>& per_sample_weights_opt,
bool include_last_offset,
int64_t padding_idx) {
return _embedding_bag_mps(weight,
indices,
offsets,
scale_grad_by_freq,
mode,
sparse,
per_sample_weights_opt,
include_last_offset,
padding_idx);
}
} // namespace at::native
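A hedged end-to-end usage sketch for the op registered above, assuming an MPS-enabled ATen build (shapes and values are illustrative only, and the exact at::embedding_bag overload is an assumption):
// Hypothetical sketch, not part of this diff: exercising embedding_bag on MPS via ATen.
#include <ATen/ATen.h>
#include <tuple>
#include <vector>

void run_embedding_bag_mps() {
  auto opts = at::TensorOptions().device(at::kMPS);
  auto weight = at::randn({10, 4}, opts);
  std::vector<int64_t> idx_vals = {1, 2, 4, 5, 4, 3};
  std::vector<int64_t> off_vals = {0, 3};
  auto indices = at::tensor(idx_vals, opts.dtype(at::kLong));
  auto offsets = at::tensor(off_vals, opts.dtype(at::kLong));
  // mode: 0 = sum, 1 = mean, 2 = max (matches EmbeddingBagMode above)
  auto result = at::embedding_bag(weight, indices, offsets,
                                  /*scale_grad_by_freq=*/false, /*mode=*/1,
                                  /*sparse=*/false, /*per_sample_weights=*/{},
                                  /*include_last_offset=*/false);
  auto output = std::get<0>(result);                  // [num_bags, feature_size]
}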

View File

@ -1,4 +1,6 @@
#include <ATen/native/SpectralOpsUtils.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
@ -10,6 +12,20 @@
#include <ATen/ops/_fft_r2c_native.h>
#endif
#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))
@implementation FakeMPSGraphFFTDescriptor
+ (nullable instancetype)descriptor {
// Redispatch the constructor to the actual implementation
id desc = NSClassFromString(@"MPSGraphFFTDescriptor");
return (FakeMPSGraphFFTDescriptor*)[desc descriptor];
}
- (nonnull id)copyWithZone:(nullable NSZone*)zone {
return self;
}
@end
#endif
namespace at::native {
namespace {
MPSGraphFFTScalingMode normalization_to_ScalingMode(int64_t normalization) {

View File

@ -2,6 +2,7 @@
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/GridSamplerUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/GridSampler.h>

View File

@ -17,6 +17,7 @@
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TensorAdvancedIndexing.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <c10/util/SmallVector.h>
#include <c10/util/irange.h>
#include <fmt/format.h>

View File

@ -6,7 +6,9 @@
#include <ATen/native/LinearAlgebra.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
@ -20,7 +22,6 @@
#include <ATen/ops/baddbmm_native.h>
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/cholesky_native.h>
#include <ATen/ops/eye_native.h>
#include <ATen/ops/linalg_cholesky_ex_native.h>
#include <ATen/ops/linalg_inv_ex_native.h>
#include <ATen/ops/linalg_lu_factor_ex_native.h>
@ -497,24 +498,26 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const
using namespace mps;
TORCH_CHECK(result.is_mps(), "Output tensor is not MPS");
TORCH_CHECK(!A.is_complex(), "linalg_inv: not supported for complex types yet!");
using CachedGraph = MPSUnaryCachedGraph;
MPSStream* stream = getCurrentMPSStream();
info.zero_();
if (A.numel() == 0) {
return;
}
if (!result.is_contiguous()) {
result.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
}
auto A_sizes = A.sizes();
int ndim = A.dim();
Tensor LU = empty_like(A, MemoryFormat::Contiguous);
Tensor identity = eye(A.size(-2), A.size(-1), A.scalar_type(), A.options().layout(), A.device()).expand_as(A);
Tensor LU = empty_like(A);
Tensor identity = zeros_like(A);
Tensor pivots = empty({A_sizes.begin(), A_sizes.end() - 1}, A.options().dtype(kInt));
// need to do this to keep the strides of the result tensor
// mps's solve expects row major layout, while inductor
// expects result to be column major
Tensor tmp = empty_like(A, MemoryFormat::Contiguous);
linalg_solve_out_mps_impl(A, identity, true, check_errors, tmp, LU, pivots, info);
result.copy_(tmp);
(ndim == 2 ? identity.diagonal() : identity.diagonal(0, -2, -1)).fill_(1);
linalg_solve_out_mps_impl(A, identity, true, check_errors, result, LU, pivots, info);
}
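For context, both versions of linalg_inv_ex_out_mps_impl shown above recover the inverse by solving the linear system A * X = I rather than inverting directly; a hedged sketch of the identity-construction step (the helper name is hypothetical) is:
// Hypothetical sketch, not part of this diff: build a batched identity with the
// same batch shape as A, then solve A * X = I so that X = A^{-1}.
#include <ATen/ATen.h>

at::Tensor batched_identity_like(const at::Tensor& A) {
  auto I = at::zeros_like(A);
  if (A.dim() == 2) {
    I.diagonal().fill_(1);
  } else {
    I.diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).fill_(1);
  }
  return I;
}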
static Tensor& mm_out_mps_impl(const Tensor& self, const Tensor& other, Tensor& output) {

View File

@ -519,13 +519,6 @@ static void max_unpool_out_mps_template(const Tensor& input,
Tensor& output,
const int32_t pooling_dims,
const std::string& op_name) {
TORCH_CHECK(output_size_.size() == static_cast<size_t>(pooling_dims),
op_name,
"There should be exactly ",
pooling_dims,
" elements but got ",
output_size_.size());
auto dims = input.dim();
auto leading_dims = input.dim() - pooling_dims;
@ -541,18 +534,6 @@ static void max_unpool_out_mps_template(const Tensor& input,
output.resize_(output_size, memory_format);
output.fill_(0);
if (indices.defined() && indices.numel() > 0) {
auto output_image_size = c10::multiply_integers(output_size_);
int64_t min_idx = indices.min().item<int64_t>();
int64_t max_idx = indices.max().item<int64_t>();
if (min_idx < 0 || max_idx >= output_image_size) {
int64_t error_idx = (min_idx < 0) ? min_idx : max_idx;
TORCH_CHECK(false, "Found an invalid max index: ", error_idx, " for output tensor of shape ", output_size_);
}
}
id<MTLDevice> device = MPSDevice::getInstance()->device();
MPSStream* mpsStream = getCurrentMPSStream();
const auto numThreads = input.numel();

Some files were not shown because too many files have changed in this diff.