Compare commits

..

3 Commits

600 changed files with 3930 additions and 7324 deletions

View File

@ -20,7 +20,7 @@ ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
# cmake-3.18.4 from pip
RUN yum install -y python3-pip && \
python3 -mpip install cmake==3.18.4 && \
python3 -m pip install cmake==3.18.4 && \
ln -s /usr/local/bin/cmake /usr/bin/cmake3
RUN rm -rf /usr/local/cuda-*

View File

@ -83,6 +83,10 @@ function build_cpython {
py_suffix=${py_ver::-1}
py_folder=$py_suffix
fi
# Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4
if [ "$py_suffix" == "3.14.0" ]; then
py_suffix="3.14.0rc2"
fi
wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz
do_cpython_build $py_ver Python-$py_suffix

View File

@ -25,7 +25,7 @@ function install_torchbench() {
python install.py --continue_on_fail
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
python -m pip freeze
popd
chown -R jenkins torchbench

View File

@ -8,8 +8,8 @@ MKLROOT=/opt/intel
mkdir -p ${MKLROOT}
pushd /tmp
python3 -mpip install wheel
python3 -mpip download -d . mkl-static==${MKL_VERSION}
python3 -m pip install wheel
python3 -m pip download -d . mkl-static==${MKL_VERSION}
python3 -m wheel unpack mkl_static-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
python3 -m wheel unpack mkl_include-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
mv mkl_static-${MKL_VERSION}/mkl_static-${MKL_VERSION}.data/data/lib ${MKLROOT}

View File

@ -11,5 +11,5 @@ ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
python -m venv /var/lib/jenkins/ci_env
source /var/lib/jenkins/ci_env/bin/activate
python -mpip install --upgrade pip
python -mpip install -r /opt/requirements-ci.txt
python -m pip install --upgrade pip
python -m pip install -r /opt/requirements-ci.txt

View File

@ -39,13 +39,9 @@ case ${DOCKER_TAG_PREFIX} in
DOCKER_GPU_BUILD_ARG=""
;;
rocm*)
# we want the patch version of 7.0 instead
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
# we want the patch version of 6.4 instead
if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4"
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
BASE_TARGET=rocm
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete

View File

@ -14,7 +14,7 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op
# cmake-3.18.4 from pip
RUN yum install -y python3-pip && \
python3 -mpip install cmake==3.18.4 && \
python3 -m pip install cmake==3.18.4 && \
ln -s /usr/local/bin/cmake /usr/bin/cmake3
FROM base as openssl
@ -135,7 +135,7 @@ RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
# cmake-3.18.4 from pip; force in case cmake3 already exists
RUN yum install -y python3-pip && \
python3 -mpip install cmake==3.18.4 && \
python3 -m pip install cmake==3.18.4 && \
ln -sf /usr/local/bin/cmake /usr/bin/cmake3
FROM cpu_final as cuda_final
@ -157,7 +157,7 @@ ENV ROCM_PATH /opt/rocm
# cmake-3.28.4 from pip to get enable_language(HIP)
# and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker
RUN python3 -m pip install --upgrade pip && \
python3 -mpip install cmake==3.28.4
python3 -m pip install cmake==3.28.4
# replace the libdrm in /opt/amdgpu with custom amdgpu.ids lookup path
ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
@ -174,7 +174,7 @@ FROM cpu_final as xpu_final
ENV XPU_DRIVER_TYPE ROLLING
# cmake-3.28.4 from pip
RUN python3 -m pip install --upgrade pip && \
python3 -mpip install cmake==3.28.4
python3 -m pip install cmake==3.28.4
ADD ./common/install_xpu.sh install_xpu.sh
ENV XPU_VERSION 2025.2
RUN bash ./install_xpu.sh && rm install_xpu.sh

View File

@ -113,7 +113,7 @@ RUN dnf install -y \
RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
# cmake-3.28.0 from pip for onnxruntime
RUN python3 -mpip install cmake==3.28.0
RUN python3 -m pip install cmake==3.28.0
ADD ./common/patch_libstdc.sh patch_libstdc.sh
RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh

View File

@ -75,13 +75,9 @@ case ${image} in
DOCKERFILE_SUFFIX="_cuda_aarch64"
;;
manylinux2_28-builder:rocm*)
# we want the patch version of 7.0 instead
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
# we want the patch version of 6.4 instead
if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4"
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
TARGET=rocm_final
MANY_LINUX_VERSION="2_28"

View File

@ -57,8 +57,8 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules
logger.info("Successfully cloned %s", target)
return r, commit
except GitCommandError:
logger.exception("Git operation failed")
except GitCommandError as e:
logger.error("Git operation failed: %s", e)
raise

View File

@ -288,7 +288,7 @@ else
# or building non-XLA tests.
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *xla* && "$BUILD_ENVIRONMENT" != *riscv64* ]]; then
# Install numpy-2.0.2 for builds which are backward compatible with 1.X
python -mpip install numpy==2.0.2
python -m pip install numpy==2.0.2
WERROR=1 python setup.py clean

View File

@ -67,13 +67,13 @@ function pip_install_whl() {
# Loop through each path and install individually
for path in "${paths[@]}"; do
echo "Installing $path"
python3 -mpip install --no-index --no-deps "$path"
python3 -m pip install --no-index --no-deps "$path"
done
else
# Loop through each argument and install individually
for path in "${args[@]}"; do
echo "Installing $path"
python3 -mpip install --no-index --no-deps "$path"
python3 -m pip install --no-index --no-deps "$path"
done
fi
}

View File

@ -182,7 +182,7 @@ checkout_install_torchbench() {
pip uninstall -y torchao
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
python -m pip freeze
}
torchbench_setup_macos() {
@ -211,7 +211,7 @@ torchbench_setup_macos() {
}
pip_benchmark_deps() {
python -mpip install --no-input requests cython scikit-learn six
python -m pip install --no-input requests cython scikit-learn six
}

View File

@ -1434,7 +1434,7 @@ EOF
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
python -m build --wheel --no-isolation -C--build-option=--bdist-dir="base_bdist_tmp" --outdir "base_dist"
python -mpip install base_dist/*.whl
python -m pip install base_dist/*.whl
echo "::endgroup::"
pushd test/forward_backward_compatibility

View File

@ -173,7 +173,7 @@ esac
PINNED_PACKAGES=(
"numpy${NUMPY_PINNED_VERSION}"
)
python -mvenv ~/${desired_python}-build
python -m venv ~/${desired_python}-build
source ~/${desired_python}-build/bin/activate
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
retry brew install libomp

View File

@ -163,13 +163,8 @@ if [[ "$(uname)" != Darwin ]]; then
MEMORY_LIMIT_MAX_JOBS=12
NUM_CPUS=$(( $(nproc) - 2 ))
if [[ "$(uname)" == Linux ]]; then
# Defaults here for **binary** linux builds so they can be changed in one place
export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
else
# For other builds
export MAX_JOBS=${NUM_CPUS}
fi
# Defaults here for **binary** linux builds so they can be changed in one place
export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
cat >>"$envfile" <<EOL
export MAX_JOBS="${MAX_JOBS}"

View File

@ -7,12 +7,16 @@ max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
ignore =
E203,E305,E402,E501,E704,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910
# these ignores are from flake8-comprehensions; please fix!
C407,
# these ignores are from flake8-logging-format; please fix!
G100,G101,G200
# these ignores are from flake8-simplify. please fix or ignore with commented reason
SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
# SIM104 is already covered by pyupgrade ruff

View File

@ -1 +1 @@
69bbe7363897764f9e758d851cd0340147d27f94
1b013f5b5a87a1882eb143c26d79d091150d6a37

29
.github/labeler.yml vendored
View File

@ -133,32 +133,3 @@
"ciflow/vllm":
- .github/ci_commit_pins/vllm.txt
"ciflow/b200":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm
"ciflow/h100":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm
"ciflow/rocm":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm

View File

@ -33,7 +33,6 @@ ciflow_push_tags:
- ciflow/rocm
- ciflow/rocm-mi300
- ciflow/rocm-mi355
- ciflow/rocm-navi31
- ciflow/s390
- ciflow/slow
- ciflow/torchbench

View File

@ -79,21 +79,21 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
),
"12.9": (
"nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | "
"nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
"nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | "
"nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | "
"nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | "
"nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
"nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"13.0": (
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "

View File

@ -24,7 +24,7 @@ change_wheel_version() {
local t_version=$4
# Extract the wheel
${PYTHON_EXECUTABLE} -mwheel unpack $wheel
${PYTHON_EXECUTABLE} -m wheel unpack $wheel
mv "${package}-${f_version}" "${package}-${t_version}"
# Change the version from f_version to t_version in the dist-info dir
@ -47,7 +47,7 @@ change_wheel_version() {
popd
# Repack the wheel
${PYTHON_EXECUTABLE} -mwheel pack "${package}-${t_version}"
${PYTHON_EXECUTABLE} -m wheel pack "${package}-${t_version}"
# Clean up
rm -rf "${package}-${t_version}"
@ -85,7 +85,7 @@ repackage_wheel() {
}
# Require to re-package the wheel
${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1
${PYTHON_EXECUTABLE} -m pip install wheel==0.45.1
pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do

View File

@ -26,8 +26,9 @@ name: !{{ build_environment }}
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "!{{ py_ver.strip('t') + ('.4' if '3.14' not in py_ver else '.0') }}"
python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
{%- endmacro %}

View File

@ -79,9 +79,9 @@ jobs:
runs-on: "windows-11-arm64-preview"
{%- else %}
{%- if branches == "nightly" %}
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
{%- else %}
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge.nonephemeral"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
{%- endif %}
{%- endif %}
timeout-minutes: !{{ common.timeout_minutes_windows_binary }}

View File

@ -211,7 +211,7 @@ jobs:
$tool --version
done
python3 -mpip install --no-index --no-deps dist/*.whl
python3 -m pip install --no-index --no-deps dist/*.whl
set +e
pushd "${RUNNER_TEMP}"
@ -222,7 +222,7 @@ jobs:
popd
if [ "${RC}" -ne 0 ]; then
python3 -mpip install --ignore-installed -r "${PIP_REQUIREMENTS_FILE}"
python3 -m pip install --ignore-installed -r "${PIP_REQUIREMENTS_FILE}"
fi
set -e

View File

@ -204,7 +204,7 @@ jobs:
run: |
pushd "${PYTORCH_FINAL_PACKAGE_DIR}"
# shellcheck disable=SC2046,SC2102
python3 -mpip install $(echo *.whl)[opt-einsum,optree] optree==0.13.0
python3 -m pip install $(echo *.whl)[opt-einsum,optree] optree==0.13.0
popd
.ci/pytorch/win-test.sh

View File

@ -126,13 +126,13 @@ jobs:
"${MANYLINUX_IMAGE}"
)
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install \
--pre torch torchvision torchaudio \
--index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"
# I wonder if there is a command to both download and install the wheels
# in one go
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip download \
docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip download \
--pre torch torchvision torchaudio \
--index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"

View File

@ -224,7 +224,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -473,7 +473,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -722,7 +722,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -971,7 +971,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1220,7 +1220,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1469,7 +1469,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1718,7 +1718,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@ -259,7 +259,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-test: # Testing
@ -925,7 +925,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-test: # Testing
@ -1591,7 +1591,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-test: # Testing
@ -2257,7 +2257,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-test: # Testing
@ -2923,7 +2923,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-test: # Testing
@ -3589,7 +3589,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_9-test: # Testing
@ -4255,7 +4255,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_9-test: # Testing

View File

@ -63,6 +63,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false

View File

@ -59,6 +59,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false
@ -105,7 +106,7 @@ jobs:
SMOKE_TEST_PARAMS=""
# shellcheck disable=SC2086
python -mvenv test_venv
python -m venv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -168,6 +169,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.11.4"
freethreaded: false
@ -214,7 +216,7 @@ jobs:
SMOKE_TEST_PARAMS=""
# shellcheck disable=SC2086
python -mvenv test_venv
python -m venv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -277,6 +279,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.12.4"
freethreaded: false
@ -323,7 +326,7 @@ jobs:
SMOKE_TEST_PARAMS=""
# shellcheck disable=SC2086
python -mvenv test_venv
python -m venv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -386,6 +389,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: false
@ -432,7 +436,7 @@ jobs:
SMOKE_TEST_PARAMS=""
# shellcheck disable=SC2086
python -mvenv test_venv
python -m venv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -495,6 +499,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: true
@ -541,7 +546,7 @@ jobs:
SMOKE_TEST_PARAMS=""
# shellcheck disable=SC2086
python -mvenv test_venv
python -m venv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -604,8 +609,9 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0"
python-version: "3.14.0-rc.2"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
@ -650,7 +656,7 @@ jobs:
SMOKE_TEST_PARAMS=""
# shellcheck disable=SC2086
python -mvenv test_venv
python -m venv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -713,8 +719,9 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0"
python-version: "3.14.0-rc.2"
freethreaded: true
- name: Checkout PyTorch
uses: actions/checkout@v4
@ -759,7 +766,7 @@ jobs:
SMOKE_TEST_PARAMS=""
# shellcheck disable=SC2086
python -mvenv test_venv
python -m venv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

View File

@ -44,7 +44,7 @@ jobs:
libtorch-cpu-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -291,7 +291,7 @@ jobs:
libtorch-cuda12_6-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -541,7 +541,7 @@ jobs:
libtorch-cuda12_8-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -791,7 +791,7 @@ jobs:
libtorch-cuda13_0-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch

View File

@ -44,7 +44,7 @@ jobs:
libtorch-cpu-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -291,7 +291,7 @@ jobs:
libtorch-cuda12_6-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -541,7 +541,7 @@ jobs:
libtorch-cuda12_8-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -791,7 +791,7 @@ jobs:
libtorch-cuda13_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch

View File

@ -44,7 +44,7 @@ jobs:
wheel-py3_10-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -279,7 +279,7 @@ jobs:
wheel-py3_10-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -517,7 +517,7 @@ jobs:
wheel-py3_10-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -755,7 +755,7 @@ jobs:
wheel-py3_10-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -993,7 +993,7 @@ jobs:
wheel-py3_10-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -1229,7 +1229,7 @@ jobs:
wheel-py3_11-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -1464,7 +1464,7 @@ jobs:
wheel-py3_11-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -1702,7 +1702,7 @@ jobs:
wheel-py3_11-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -1940,7 +1940,7 @@ jobs:
wheel-py3_11-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -2178,7 +2178,7 @@ jobs:
wheel-py3_11-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -2414,7 +2414,7 @@ jobs:
wheel-py3_12-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -2649,7 +2649,7 @@ jobs:
wheel-py3_12-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -2887,7 +2887,7 @@ jobs:
wheel-py3_12-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -3125,7 +3125,7 @@ jobs:
wheel-py3_12-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -3363,7 +3363,7 @@ jobs:
wheel-py3_12-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -3599,7 +3599,7 @@ jobs:
wheel-py3_13-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -3834,7 +3834,7 @@ jobs:
wheel-py3_13-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -4072,7 +4072,7 @@ jobs:
wheel-py3_13-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -4310,7 +4310,7 @@ jobs:
wheel-py3_13-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -4548,7 +4548,7 @@ jobs:
wheel-py3_13-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -4784,7 +4784,7 @@ jobs:
wheel-py3_13t-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -5019,7 +5019,7 @@ jobs:
wheel-py3_13t-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -5257,7 +5257,7 @@ jobs:
wheel-py3_13t-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -5495,7 +5495,7 @@ jobs:
wheel-py3_13t-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -5733,7 +5733,7 @@ jobs:
wheel-py3_13t-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -5969,7 +5969,7 @@ jobs:
wheel-py3_14-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -6204,7 +6204,7 @@ jobs:
wheel-py3_14-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -6442,7 +6442,7 @@ jobs:
wheel-py3_14-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -6680,7 +6680,7 @@ jobs:
wheel-py3_14-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -6918,7 +6918,7 @@ jobs:
wheel-py3_14-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -7154,7 +7154,7 @@ jobs:
wheel-py3_14t-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -7389,7 +7389,7 @@ jobs:
wheel-py3_14t-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -7627,7 +7627,7 @@ jobs:
wheel-py3_14t-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -7865,7 +7865,7 @@ jobs:
wheel-py3_14t-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -8103,7 +8103,7 @@ jobs:
wheel-py3_14t-xpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch

View File

@ -1,63 +0,0 @@
name: rocm-navi31
on:
push:
tags:
- ciflow/rocm-navi31/*
workflow_dispatch:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 45 */2 * * 1-5
- cron: 45 4,12 * * 0,6
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
target-determination:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/target_determination.yml
permissions:
id-token: write
contents: read
linux-jammy-rocm-py3_10-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3_10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
tests-to-include: >-
${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
inductor/test_flex_attention inductor/test_max_autotune' || '' }}
secrets: inherit

View File

@ -59,3 +59,29 @@ jobs:
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-gfx1100-test:
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3_10-gfx1100
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
]}
tests-to-include: >
test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
inductor/test_flex_attention inductor/test_max_autotune
secrets: inherit

View File

@ -190,40 +190,6 @@ jobs:
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
secrets: inherit
linux-jammy-rocm-py3_10-build:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
secrets: inherit
inductor-build:
name: inductor-build
uses: ./.github/workflows/_linux-build.yml

1
.gitignore vendored
View File

@ -374,7 +374,6 @@ third_party/ruy/
third_party/glog/
# Virtualenv
.venv/
venv/
# Log files

View File

@ -201,17 +201,3 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
/torch/csrc/stable/ @janeyx99 @mikaylagawarecki
/torch/headeronly/ @janeyx99
/torch/header_only_apis.txt @janeyx99
# FlexAttention
/torch/nn/attention/flex_attention.py @drisspg
/torch/_higher_order_ops/flex_attention.py @drisspg
/torch/_inductor/kernel/flex/ @drisspg
/torch/_inductor/codegen/cpp_flex_attention_template.py @drisspg
/test/inductor/test_flex_attention.py @drisspg
/test/inductor/test_flex_decoding.py @drisspg
# Low Precision GEMMs
/aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58
/test/test_scaled_matmul_cuda.py @drisspg @slayton58

View File

@ -39,7 +39,7 @@ RUN chmod +x ~/miniconda.sh && \
bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
/opt/conda/bin/python -mpip install -r requirements.txt && \
/opt/conda/bin/python -m pip install -r requirements.txt && \
/opt/conda/bin/conda clean -ya
FROM dev-base as submodule-update

View File

@ -313,14 +313,13 @@ IF(USE_FBGEMM_GENAI)
# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
if(DEFINED ROCM_VERSION_DEV AND ROCM_VERSION_DEV VERSION_LESS "7.2.0")
list(PREPEND FBGEMM_GENAI_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1)
endif()
# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(

View File

@ -39,7 +39,7 @@ struct HostBlock {
};
template <typename B>
struct alignas(hardware_destructive_interference_size) FreeBlockList {
struct alignas(64) FreeBlockList {
std::mutex mutex_;
std::deque<B*> list_;
};
@ -122,7 +122,7 @@ struct TORCH_API HostStats {
// Struct containing memory allocator summary statistics for host, as they
// are staged for reporting. This is a temporary struct that is used to
// avoid locking the allocator while collecting stats.
struct alignas(hardware_destructive_interference_size) HostStatsStaged {
struct alignas(64) HostStatsStaged {
std::mutex timing_mutex_;
// COUNT: total allocations (active + free)
// LOCK: access to this stat is protected by the allocator's blocks_mutex_
@ -669,7 +669,7 @@ struct CachingHostAllocatorImpl {
TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event");
}
alignas(hardware_destructive_interference_size) std::mutex blocks_mutex_;
alignas(64) std::mutex blocks_mutex_;
ska::flat_hash_set<B*> blocks_; // block list
ska::flat_hash_map<void*, B*> ptr_to_block_;
@ -677,17 +677,17 @@ struct CachingHostAllocatorImpl {
// size. This allows us to quickly find a free block of the right size.
// We use deque to store per size free list and guard the list with its own
// mutex.
alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>> free_list_ =
alignas(64) std::vector<FreeBlockList<B>> free_list_ =
std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
alignas(64) std::mutex events_mutex_;
std::deque<std::pair<E, B*>> events_; // event queue paired with block
// Indicates whether the object is active.
// Set to false in the destructor to signal background threads to stop.
std::atomic<bool> active_{true};
protected:
alignas(hardware_destructive_interference_size) HostStatsStaged stats_;
alignas(64) HostStatsStaged stats_;
};
struct TORCH_API HostAllocator : public at::Allocator {

View File

@ -183,6 +183,11 @@ struct CUDACachingHostAllocatorImpl
return true;
}
bool pinned_use_background_threads() override {
return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
pinned_use_background_threads();
}
EventPool::Event create_event_internal(DeviceIndex idx) {
// Leak the event pool to avoid shutdown issue.
static auto* event_pool = new EventPool();

View File

@ -177,6 +177,7 @@ inline void segmented_sort_pairs(
}
}
#if CUB_SUPPORTS_UNIQUE_BY_KEY()
template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename NumSelectedIteratorT>
inline void unique_by_key(
KeysInputIteratorT keys_in, ValuesInputIteratorT values_in,
@ -192,6 +193,7 @@ inline void unique_by_key(
CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::UniqueByKey,
keys_in, values_in, keys_out_, values_out, num_selected, num_input_items, c10::cuda::getCurrentCUDAStream());
}
#endif
namespace impl {
@ -577,6 +579,7 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
#endif
}
#if CUB_SUPPORTS_SCAN_BY_KEY()
template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT>
inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items) {
@ -604,6 +607,7 @@ inline void inclusive_scan_by_key(KeysInputIteratorT keys, ValuesInputIteratorT
#endif
}
#endif
template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT>
void unique(InputIteratorT input, OutputIteratorT output,

View File

@ -28,6 +28,22 @@
#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
#endif
// cub support for UniqueByKey is added to cub 1.16 in:
// https://github.com/NVIDIA/cub/pull/405
#if CUB_VERSION >= 101600
#define CUB_SUPPORTS_UNIQUE_BY_KEY() true
#else
#define CUB_SUPPORTS_UNIQUE_BY_KEY() false
#endif
// cub support for scan by key is added to cub 1.15
// in https://github.com/NVIDIA/cub/pull/376
#if CUB_VERSION >= 101500
#define CUB_SUPPORTS_SCAN_BY_KEY() 1
#else
#define CUB_SUPPORTS_SCAN_BY_KEY() 0
#endif
// cub support for cub::FutureValue is added to cub 1.15 in:
// https://github.com/NVIDIA/cub/pull/305
#if CUB_VERSION >= 101500

View File

@ -160,10 +160,6 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({
DispatchKey::CUDA,
DispatchKey::CPU,
DispatchKey::PrivateUse1,
DispatchKey::SparseCPU,
DispatchKey::SparseCUDA,
DispatchKey::SparseCsrCPU,
DispatchKey::SparseCsrCUDA,
});
inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) {

View File

@ -658,7 +658,6 @@ static void check_shape_forward(const at::Tensor& input,
TORCH_CHECK(!params.is_output_padding_neg(), "negative output_padding is not supported");
TORCH_CHECK(!params.is_stride_nonpos(), "non-positive stride is not supported");
TORCH_CHECK(!params.is_dilation_neg(), "dilation should be greater than zero");
TORCH_CHECK(groups > 0, "expected groups to be greater than 0, but got groups=", groups);
TORCH_CHECK(weight_dim == k,
"Expected ", weight_dim, "-dimensional input for ", weight_dim,

View File

@ -3620,7 +3620,7 @@ Tensor& _int_mm_out_cpu(const Tensor& self, const Tensor& mat2, Tensor& result)
try {
mkldnn_matmul_i8i8i32(self, mat2, result);
dispatched = true;
} catch ([[maybe_unused]] const std::exception& e) {
} catch (const std::exception& e) {
TORCH_WARN(func_name, " failed, switching to BLAS gemm: ", e.what());
}
}

View File

@ -2322,23 +2322,12 @@ _scaled_nvfp4_nvfp4(
const Tensor& scale_b, const SwizzleType swizzle_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
Tensor& out,
const std::optional<Tensor>& global_scale_a = std::nullopt,
const std::optional<Tensor>& global_scale_b = std::nullopt) {
const bool single_scale,
Tensor& out) {
#ifdef USE_ROCM
TORCH_CHECK_NOT_IMPLEMENTED(false, "NVFP4 scaling not supported on ROCM");
#endif
std::optional<Tensor> alpha = std::nullopt;
// Note: "Or" here means that if only one scale is passed, we check for the other. Otherwise,
// if this is "And" we would silently do nothing in the case where one global scale is
// passed and not the other.
if (global_scale_a.has_value() || global_scale_b.has_value()) {
TORCH_CHECK_VALUE(global_scale_a.has_value(),
"For two-level-scaled NVFP4, global_scale_a must have a value");
TORCH_CHECK_VALUE(global_scale_b.has_value(),
"For two-level-scaled NVFP4, global_scale_b must have a value");
alpha = global_scale_a.value().mul(global_scale_b.value());
}
TORCH_CHECK_VALUE(single_scale, "Only single-scaled NVFP4 currently supported");
// Restrictions:
// A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
// Scales must be swizzled
@ -2360,7 +2349,7 @@ _scaled_nvfp4_nvfp4(
auto scaling_choice_a = ScalingType::BlockWise1x16;
auto scaling_choice_b = ScalingType::BlockWise1x16;
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out, alpha);
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
}
@ -2566,10 +2555,9 @@ _scaled_mm_cuda_v2_out(
} else if (gemm_impl == ScaledGemmImplementation::MXFP8_MXFP8) {
return _scaled_mxfp8_mxfp8(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4) {
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out,
scale_a[1], scale_b[1]);
TORCH_CHECK_NOT_IMPLEMENTED(false, "Only single-scale NVFP4 currently supported");
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE) {
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, true /* single_scale */, out);
} else if (gemm_impl == ScaledGemmImplementation::MXFP4_MXFP4) {
return _scaled_mxfp4_mxfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
} else {

View File

@ -15,7 +15,9 @@
#include <ATen/native/cuda/block_reduce.cuh>
#include <ATen/native/cuda/thread_constants.h>
#if CUB_SUPPORTS_SCAN_BY_KEY()
#include <thrust/iterator/reverse_iterator.h>
#endif
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -238,6 +240,10 @@ __global__ void renorm_kernel(
} // anonymous namespace
#if !CUB_SUPPORTS_SCAN_BY_KEY()
template<typename index_t>
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
#endif
Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices_,
int64_t num_weights, int64_t padding_idx,
@ -300,6 +306,7 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
if (scale_grad_by_freq) {
count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
#if CUB_SUPPORTS_SCAN_BY_KEY()
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -326,6 +333,11 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
num_indices
);
});
#else
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
});
#endif
}
return embedding_backward_cuda_kernel(grad, orig_indices,

View File

@ -10,7 +10,9 @@
#include <c10/macros/Macros.h>
#if CUB_SUPPORTS_UNIQUE_BY_KEY()
#include <thrust/iterator/counting_iterator.h>
#endif
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -194,9 +196,18 @@ __global__ void compute_num_of_partial_segments(const index_t *partials_per_segm
partials_per_segment_offset[num_of_segments-1];
}
#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
__global__ void write_num_of_segments_for_legacy_thrust_path(int64_t *num_of_segments_ptr, int64_t num_of_segments) {
*num_of_segments_ptr = num_of_segments;
}
#endif
} // anon namespace
#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
template<typename index_t>
int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets);
#endif
Tensor embedding_backward_cuda_kernel(
const Tensor &grad,
@ -223,12 +234,20 @@ Tensor embedding_backward_cuda_kernel(
auto segment_offsets = at::empty({numel}, orig_indices.options());
auto num_of_segments_tensor = at::empty({}, grad.options().dtype(kLong));
int64_t *num_of_segments_ptr = num_of_segments_tensor.mutable_data_ptr<int64_t>();
#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () {
int64_t num_of_segments = embedding_backward_cuda_kernel_unique_by_key<index_t>(sorted_indices, segment_offsets);
write_num_of_segments_for_legacy_thrust_path<<<1, 1, 0, c10::cuda::getCurrentCUDAStream()>>>(num_of_segments_ptr, num_of_segments);
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
#else
AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () {
cuda::cub::unique_by_key(
sorted_indices.const_data_ptr<index_t>(), thrust::make_counting_iterator(0),
segment_offsets.mutable_data_ptr<index_t>(),
num_of_segments_ptr, sorted_indices.numel());
});
#endif
int64_t max_segments = std::min<int64_t>(numel, num_weights);

View File

@ -31,10 +31,16 @@
#include <c10/macros/Macros.h>
#if CUB_SUPPORTS_SCAN_BY_KEY()
#include <thrust/iterator/reverse_iterator.h>
#endif
namespace at::native {
#if !CUB_SUPPORTS_SCAN_BY_KEY()
template<typename index_t>
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
#endif
namespace {
@ -193,6 +199,7 @@ Tensor embedding_bag_backward_cuda_sum_avg(
if (scale_grad_by_freq) {
count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
#if CUB_SUPPORTS_SCAN_BY_KEY()
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -219,6 +226,11 @@ Tensor embedding_bag_backward_cuda_sum_avg(
num_indices
);
});
#else
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
});
#endif
}
return embedding_backward_cuda_kernel(grad, orig_indices, sorted_indices,
count, num_weights, padding_idx, mode == EmbeddingBagMode::MEAN, offset2bag,

View File

@ -0,0 +1,90 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/native/cuda/SortingCommon.cuh>
#include <ATen/cuda/cub_definitions.cuh>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/empty_like.h>
#endif
#include <ATen/cuda/ThrustAllocator.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/constant_iterator.h>
namespace at::native {
#if !CUB_SUPPORTS_SCAN_BY_KEY()
template<typename index_t>
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count) {
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
at::cuda::ThrustAllocator allocator;
auto policy = thrust::cuda::par(allocator).on(stream);
auto num_indices = count.numel();
// Compute an increasing sequence per unique item in sortedIndices:
// sorted: 2 5 5 5 7 7 8 9 9
// count: 1 1 2 3 1 2 1 1 2
auto sorted_data = thrust::device_ptr<const index_t>(sorted_indices.const_data_ptr<index_t>());
auto count_data = thrust::device_ptr<index_t>(count.mutable_data_ptr<index_t>());
thrust::inclusive_scan_by_key(
policy,
sorted_data,
sorted_data + num_indices,
thrust::make_constant_iterator(1),
count_data
);
// Take the maximum of each count per unique key in reverse:
// sorted: 2 5 5 5 7 7 8 9 9
// count: 1 3 3 3 2 2 1 2 2
thrust::inclusive_scan_by_key(
policy,
thrust::make_reverse_iterator(sorted_data + num_indices),
thrust::make_reverse_iterator(sorted_data),
thrust::make_reverse_iterator(count_data + num_indices),
thrust::make_reverse_iterator(count_data + num_indices),
thrust::equal_to<index_t>(),
thrust::maximum<index_t>()
);
}
template
void embedding_dense_backward_cuda_scan<int>(Tensor &sorted_indices, Tensor &count);
template
void embedding_dense_backward_cuda_scan<int64_t>(Tensor &sorted_indices, Tensor &count);
#endif
template<typename index_t>
int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets) {
auto stream = at::cuda::getCurrentCUDAStream();
at::cuda::ThrustAllocator allocator;
auto policy = thrust::cuda::par(allocator).on(stream);
const ptrdiff_t numel = sorted_indices.numel();
auto sorted_indices_dev = thrust::device_ptr<const index_t>(sorted_indices.const_data_ptr<index_t>());
auto dummy = at::empty_like(sorted_indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto dummy_dev = thrust::device_ptr<index_t>(dummy.mutable_data_ptr<index_t>());
auto ends = thrust::unique_by_key_copy(
policy,
sorted_indices_dev,
sorted_indices_dev + numel,
thrust::make_counting_iterator(0),
dummy_dev,
thrust::device_ptr<index_t>(segment_offsets.mutable_data_ptr<index_t>()));
return thrust::get<0>(ends) - dummy_dev;
}
template
int64_t embedding_backward_cuda_kernel_unique_by_key<int>(const Tensor &sorted_indices, Tensor &segment_offsets);
template
int64_t embedding_backward_cuda_kernel_unique_by_key<int64_t>(const Tensor &sorted_indices, Tensor &segment_offsets);
} // namespace at::native

View File

@ -146,7 +146,6 @@ __global__ void nll_loss2d_backward_no_reduce_kernel(
int64_t batch_size = target.size(0);
int64_t H = target.size(1);
int64_t W = target.size(2);
int64_t n_classes = grad_input.size(1);
CUDA_KERNEL_LOOP(index, n_threads) {
const int64_t b = index % batch_size;
@ -157,7 +156,6 @@ __global__ void nll_loss2d_backward_no_reduce_kernel(
if (cur_target == ignore_index) {
continue;
}
CUDA_KERNEL_ASSERT(cur_target >= 0 && cur_target < n_classes);
scalar_t value = -(weight != nullptr ? weight[cur_target] : static_cast<scalar_t>(1));
grad_input[b][cur_target][h][w] = value * grad_output[b][h][w];
}

View File

@ -413,12 +413,14 @@ struct ReduceOp {
value = thread_reduce<output_vec_size>(input_slice);
}
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}
if (config.should_block_y_reduce()) {
value = block_y_reduce<output_vec_size>(value, shared_memory);
}
__syncthreads();
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}
using out_ptr_vec_t = std::array<out_scalar_t*, output_vec_size>;
using offset_vec_t = std::array<index_t, output_vec_size>;
offset_vec_t base_offsets;
@ -655,8 +657,8 @@ struct ReduceOp {
__syncthreads();
// Intra-warp reduction, fix CUDA to have offset decreasing for better numerics
// matching Triton, etc.
// TODO(PaulZhang12): AMD and internal
#if defined(USE_ROCM) || defined(FBCODE_CAFFE2)
// todo for AMD
#ifdef USE_ROCM
for (int offset = 1; offset < dim_x; offset <<= 1) {
#else
for (int offset = dim_x >> 1; offset > 0; offset >>= 1) {

View File

@ -19,6 +19,7 @@
namespace at::native {
// TODO: remove this when CUDA <11.6 is no longer supported
void topk_out_with_sort(
const Tensor& self,
int64_t k, int64_t dim, bool largest,
@ -30,12 +31,21 @@ void topk_out_with_sort(
indices.copy_(sorted_indices.narrow(dim, 0, k));
}
// TODO: remove this when CUDA <11.6 is no longer supported
bool disable_sort_for_topk();
bool should_use_sort(const Tensor& self, int64_t dim) {
#if defined(USE_ROCM)
if (self.dtype() == kBool) return false; // Bool sort not supported in ROCm: https://github.com/pytorch/pytorch/issues/139972
return (self.numel() >= 10000 && self.numel() == self.size(dim)); // based on the experiments in https://github.com/pytorch/pytorch/pull/146387
#else
return false;
if (disable_sort_for_topk()) return false;
// This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/68632
if (self.dim() == 0) return false;
if (self.dtype() == kBool) return false; // Bool is not support by topk
int64_t slice_size = self.size(dim);
if (slice_size == 0) return false;
int64_t num_slices = self.numel() / slice_size;
return num_slices <= 10 && slice_size >= 100000;
#endif
}

View File

@ -21,6 +21,11 @@ using namespace at::native;
namespace at::native {
// TODO: remove this when CUDA <11.6 is no longer supported
bool disable_sort_for_topk() {
return CUB_SUPPORTS_SCAN_BY_KEY();
}
namespace sbtopk { // single_block_topk
template <typename T>
@ -413,6 +418,10 @@ __global__ void computeBlockwiseWithinKCounts(
}
__syncthreads();
#if !CUB_SUPPORTS_SCAN_BY_KEY()
return;
#endif
Bitwise desired_digit = at::cuda::Bitfield<Bitwise>::getBitfield(desired, current_bit, RADIX_BITS);
// if largest, then only threads that has tidx > desired_digit are active
@ -468,6 +477,7 @@ __global__ void computeBlockwiseWithinKCounts(
}
}
#if CUB_SUPPORTS_SCAN_BY_KEY()
// Assumption: slice_size can not be larger than UINT32_MAX
template <typename Bitwise>
__global__ void computeBlockwiseKthCounts(
@ -599,6 +609,7 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo<const T, IndexType> inpu
}
}
}
#endif
int get_items_per_thread(uint64_t num_slices, uint64_t slice_size) {
// occupancy of this kernel is limited by registers per threads
@ -676,12 +687,16 @@ void launch(
uint32_t* digit_cum_sum = reinterpret_cast<uint32_t*>(digit_cum_sum_buffer.get());
AT_CUDA_CHECK(cudaMemsetAsync(digit_cum_sum, 0, numInputSlices * RADIX_DIGITS * sizeof(uint32_t), stream));
#if CUB_SUPPORTS_SCAN_BY_KEY()
auto withinKCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t));
uint32_t* withinKCounts = reinterpret_cast<uint32_t*>(withinKCounts_buffer.get());
AT_CUDA_CHECK(cudaMemsetAsync(withinKCounts, 0, num_blocks * sizeof(uint32_t), stream));
auto kthCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t));
uint32_t* kthCounts = reinterpret_cast<uint32_t*>(kthCounts_buffer.get());
#else
uint32_t* withinKCounts = nullptr;
#endif
Bitwise desiredMask = 0;
dim3 grid;
@ -728,6 +743,7 @@ void launch(
}
desired = desired_in;
#if CUB_SUPPORTS_SCAN_BY_KEY()
computeBlockwiseKthCounts<Bitwise><<<std::min(((int64_t)numInputSlices + 255) / 256, (int64_t)1073741824), 256, 0, stream>>>(
desired, counts, num_blocks, blocks_per_slice, kthCounts);
C10_CUDA_KERNEL_LAUNCH_CHECK();
@ -743,6 +759,28 @@ void launch(
topK, topKWithinSliceStride, indices, indicesWithinSliceStride, items_per_thread,
blocks_per_slice, kthValues, withinKCounts, kthCounts, num_blocks);
C10_CUDA_KERNEL_LAUNCH_CHECK();
#else
// Find topk values based on kth values
{
dim3 grid;
TORCH_INTERNAL_ASSERT(getGridFromTiles(numInputSlices, grid), "Too many slices for topk");
int warp_size = at::cuda::warp_size();
dim3 block(std::min(at::ceil_div((int64_t)inputSliceSize, (int64_t)warp_size) * (int64_t)warp_size, (int64_t)1024));
sbtopk::gatherTopK<T, IndexType, Dim, /* WithKthValues= */true><<<grid, block, 0, stream>>>(
input,
inputSliceSize,
outputSliceSize,
largest,
numInputSlices,
inputWithinSliceStride,
topK,
topKWithinSliceStride,
indices,
indicesWithinSliceStride,
kthValues);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
#endif
}
} // namespace mbtopk
@ -750,6 +788,7 @@ void launch(
bool should_use_multiblock(int64_t num_slices, int64_t slice_size) {
if (num_slices > std::numeric_limits<uint32_t>::max() ||
slice_size > std::numeric_limits<uint32_t>::max()) return false;
#if CUB_SUPPORTS_SCAN_BY_KEY()
// This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/74267
return (num_slices <= 20 && slice_size >= 20000) ||
(num_slices > 20 && num_slices <= 40 && slice_size >= 10000) ||
@ -758,6 +797,12 @@ bool should_use_multiblock(int64_t num_slices, int64_t slice_size) {
(num_slices >= 200 && num_slices < 800 && slice_size >= 3000) ||
(num_slices >= 800 && num_slices <= 4000 && slice_size >= 800) ||
(num_slices > 4000 && slice_size >= 400);
#else
// This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/71081
return (num_slices <= 400 && slice_size >= 5000) ||
(num_slices > 400 && num_slices < 4000 && slice_size >= 1000) ||
(num_slices >= 4000 && slice_size >= 300);
#endif
}
void launch_gather_topk_kernel(

View File

@ -44,7 +44,7 @@ __global__ void triu_tril_kernel(
const int64_t k,
const int64_t N_padded,
const IndexType last_dim_padded) {
int64_t linear_idx = (((int64_t)blockIdx.x) * blockDim.x + threadIdx.x) * elements_per_thread;
int64_t linear_idx = (blockIdx.x * blockDim.x + threadIdx.x) * elements_per_thread;
if (linear_idx >= N_padded) {
return;
}

View File

@ -127,6 +127,29 @@ __global__ void upsample_bilinear2d_nhwc_out_frame(
}
}
#ifdef USE_ROCM
// Helper function to compute output pixel range that can contribute to input pixel
template <typename accscalar_t>
__device__ __forceinline__ void compute_output_range(
int input_pos,
accscalar_t scale,
int output_size,
bool align_corners,
int& min_output,
int& max_output) {
accscalar_t lo, hi;
if (align_corners) {
lo = static_cast<accscalar_t>(input_pos - 1) / scale;
hi = static_cast<accscalar_t>(input_pos + 1) / scale;
} else {
lo = (input_pos - static_cast<accscalar_t>(0.5)) / scale - static_cast<accscalar_t>(0.5);
hi = (input_pos + static_cast<accscalar_t>(1.5)) / scale - static_cast<accscalar_t>(0.5);
}
min_output = max(0, static_cast<int>(ceil(lo)));
max_output = min(output_size - 1, static_cast<int>(floor(hi)));
}
#endif
// Backward (adjoint) operation 1 <- 2 (accumulates)
template <typename scalar_t, typename accscalar_t>
C10_LAUNCH_BOUNDS_1(1024)
@ -141,8 +164,74 @@ __global__ void upsample_bilinear2d_backward_out_frame(
const bool align_corners,
scalar_t* __restrict__ idata,
const scalar_t* __restrict__ odata) {
const size_t o_numel = nc * width2 * height2;
// In C++, integer multiplication, like in standard arithmetic, is generally commutative.
const size_t i_numel = nc * width1 * height1;
#ifdef USE_ROCM
for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < i_numel;
index += blockDim.x * gridDim.x) {
// Decode input pixel coordinates
size_t index_temp = index;
const int w1 = index_temp % width1;
index_temp /= width1;
const int h1 = index_temp % height1;
const size_t nc_idx = index_temp / height1;
accscalar_t grad_sum = 0;
// Find range of output pixels that could interpolate from this input pixel
int h2_min, h2_max, w2_min, w2_max;
compute_output_range<accscalar_t>(h1, rheight, height2, align_corners, h2_min, h2_max);
compute_output_range<accscalar_t>(w1, rwidth, width2, align_corners, w2_min, w2_max);
// Iterate over potential output pixels
for (int h2 = h2_min; h2 <= h2_max; h2++) {
for (int w2 = w2_min; w2 <= w2_max; w2++) {
// Compute source coordinates for this output pixel
const accscalar_t h1r = area_pixel_compute_source_index<accscalar_t>(
rheight, h2, align_corners, /*cubic=*/false);
const int h1_base = (int)h1r;
const int h1p = (h1_base < height1 - 1) ? 1 : 0;
const accscalar_t h1lambda = h1r - h1_base;
const accscalar_t h0lambda = static_cast<accscalar_t>(1) - h1lambda;
const accscalar_t w1r = area_pixel_compute_source_index<accscalar_t>(
rwidth, w2, align_corners, /*cubic=*/false);
const int w1_base = (int)w1r;
const int w1p = (w1_base < width1 - 1) ? 1 : 0;
const accscalar_t w1lambda = w1r - w1_base;
const accscalar_t w0lambda = static_cast<accscalar_t>(1) - w1lambda;
// Check if our input pixel participates in this interpolation and accumulate all weights
// At boundaries, h1p=0 or w1p=0 causes some sampling positions to collapse
// to the same pixel, so we need to accumulate weights from all matching positions
accscalar_t weight = 0;
// Check all four interpolation positions and accumulate weights
if (h1 == h1_base && w1 == w1_base) {
weight += h0lambda * w0lambda; // top-left
}
if (h1 == h1_base && w1 == w1_base + w1p) {
weight += h0lambda * w1lambda; // top-right (may be same as top-left if w1p=0)
}
if (h1 == h1_base + h1p && w1 == w1_base) {
weight += h1lambda * w0lambda; // bottom-left (may be same as top-left if h1p=0)
}
if (h1 == h1_base + h1p && w1 == w1_base + w1p) {
weight += h1lambda * w1lambda; // bottom-right (may collapse to other positions)
}
if (weight > 0) {
const size_t output_idx = nc_idx * height2 * width2 + h2 * width2 + w2;
grad_sum += weight * static_cast<accscalar_t>(odata[output_idx]);
}
}
}
// Write accumulated gradient (no atomics needed)
idata[index] = static_cast<scalar_t>(grad_sum);
}
#else
const size_t o_numel = nc * width2 * height2;
for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < o_numel;
index += blockDim.x * gridDim.x) {
size_t index_temp = index;
@ -191,6 +280,7 @@ __global__ void upsample_bilinear2d_backward_out_frame(
static_cast<scalar_t>(h1lambda * w1lambda * d2val),
true);
}
#endif
}
template <typename scalar_t, typename accscalar_t>
@ -387,7 +477,6 @@ static void upsample_bilinear2d_backward_out_cuda_template(
// threads are not covering the whole input tensor.
grad_input.zero_();
const size_t num_kernels = nbatch * channels * output_height * output_width;
const int num_threads = std::min(
at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -397,6 +486,12 @@ static void upsample_bilinear2d_backward_out_cuda_template(
return;
}
#ifdef USE_ROCM
constexpr bool use_input = true;
#else
constexpr bool use_input = false;
#endif
AT_DISPATCH_FLOATING_TYPES_AND2(
at::ScalarType::Half, at::ScalarType::BFloat16,
grad_output_.scalar_type(), "upsample_bilinear2d_backward_out_frame", [&] {
@ -414,6 +509,8 @@ static void upsample_bilinear2d_backward_out_cuda_template(
const accscalar_t rwidth = area_pixel_compute_scale<accscalar_t>(
input_width, output_width, align_corners, scales_w);
const size_t num_kernels = nbatch * channels * output_height * output_width;
upsample_bilinear2d_backward_nhwc_out_frame<scalar_t, accscalar_t>
<<<ceil_div(num_kernels, static_cast<size_t>(num_threads)), num_threads, 0, stream>>>(
input_height,
@ -444,6 +541,8 @@ static void upsample_bilinear2d_backward_out_cuda_template(
const accscalar_t rwidth = area_pixel_compute_scale<accscalar_t>(
input_width, output_width, align_corners, scales_w);
const size_t num_kernels = nbatch * channels * (use_input ? input_height * input_width : output_height * output_width);
upsample_bilinear2d_backward_out_frame<scalar_t, accscalar_t>
<<<ceil_div(num_kernels, static_cast<size_t>(num_threads)),
num_threads,

View File

@ -466,7 +466,7 @@ struct ReduceJitOp {
__syncthreads();
#if defined(USE_ROCM) || defined(FBCODE_CAFFE2)
#ifdef USE_ROCM
for (int offset = 1; offset < dim_x; offset <<= 1) {
#else
for (int offset = dim_x >> 1; offset > 0; offset >>= 1) {

View File

@ -487,7 +487,9 @@ std::unique_ptr<fe::graph::Graph> build_graph(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA")
.set_generate_stats(return_softmaxstats)
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
@ -705,7 +707,9 @@ std::unique_ptr<fe::graph::Graph> build_graph_nestedtensor(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA_NESTEDTENSOR")
.set_generate_stats(return_softmaxstats)
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale)
.set_seq_len_q(SEQ_LEN_Q_)

View File

@ -441,7 +441,7 @@ kernel void applySYRK(
uint3 tid [[thread_position_in_threadgroup]],
uint3 tgid [[threadgroup_position_in_grid]],
uint3 tpg [[threads_per_threadgroup]],
uint warp_id [[simdgroup_index_in_threadgroup]]) {
uint sgitg [[simdgroup_index_in_threadgroup]]) {
const uint tx = tid.x;
const uint ty = tid.y;
const uint simdGroupsPerThreadgroup = (tpg.x * tpg.y + 31) / 32;
@ -474,8 +474,11 @@ kernel void applySYRK(
(actSize_j % 8 == 0) && (actSize_h % 8 == 0) && (actSize_k % 8 == 0);
if (use_simdgroup) {
uint warp_id = sgitg;
simdgroup_matrix<float, 8, 8> negative_identity =
simdgroup_matrix<float, 8, 8>(-1.0);
simdgroup_matrix<float, 8, 8> identity = simdgroup_matrix<float, 8, 8>(1.0);
simdgroup_matrix<float, 8, 8> Prod;
simdgroup_matrix<float, 8, 8> Afrag;
simdgroup_matrix<float, 8, 8> Bfrag;
@ -518,7 +521,8 @@ kernel void applySYRK(
/* transpose = */ upper);
simdgroup_multiply(Prod, Afrag, Bfrag);
simdgroup_multiply_accumulate(Cfrag, Prod, negative_identity, Cfrag);
simdgroup_multiply(Prod, Prod, negative_identity);
simdgroup_multiply_accumulate(Cfrag, Cfrag, identity, Prod);
}
simdgroup_store(

View File

@ -196,28 +196,6 @@ bool use_metal_mm(const Tensor& self, const Tensor& other, const Tensor& output)
other.size(0) > max_stride_size || other.size(1) > max_stride_size);
}
void map_mps_decomposition_error_code_to_blas(const Tensor& status) {
const auto& status_flat = status.view(-1);
for (const auto i : c10::irange(status_flat.size(0))) {
int code = status_flat[i].item<int>();
switch (code) {
case MPSMatrixDecompositionStatusSuccess:
status_flat[i] = 0;
break;
case MPSMatrixDecompositionStatusNonPositiveDefinite:
case MPSMatrixDecompositionStatusSingular:
status_flat[i] = 2;
break;
case MPSMatrixDecompositionStatusFailure:
status_flat[i] = -1;
break;
default:
TORCH_INTERNAL_ASSERT(false, "Unknown MPSMatrixDecompositionStatus enum value: ", code);
}
}
}
} // anonymous namespace
static void linalg_lu_factor_ex_out_mps_impl(const Tensor& A,
@ -509,9 +487,6 @@ static void linalg_solve_out_mps_impl(const Tensor& A,
"mpsmatrixdecompositionstatus for details.");
}
}
map_mps_decomposition_error_code_to_blas(info);
if (!left) {
// If this was a right solve, transpose the result back
result.copy_(result_t.transpose(-2, -1).contiguous());

View File

@ -706,7 +706,6 @@
variants: function, method
dispatch:
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_all
tags: reduction
- func: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
@ -716,7 +715,6 @@
cpp_no_default_args: ['dim']
dispatch:
CompositeExplicitAutograd: all_dims_default
tags: reduction
- func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -725,7 +723,6 @@
CPU, CUDA: all_out
MPS: all_out_mps
MTIA: all_out_mtia
tags: reduction
- func: all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -734,16 +731,13 @@
CPU, CUDA: all_dims_out
CompositeExplicitAutograd: all_dims_out_default
cpp_no_default_args: ['dim']
tags: reduction
- func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool
variants: function, method
@ -755,14 +749,14 @@
device_check: NoCheck # TensorIterator
structured_delegate: any.out
variants: function, method
tags: [core, reduction]
tags: core
- func: any.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
structured_delegate: any.dims_out
variants: function, method
cpp_no_default_args: ['dim']
tags: [core, reduction]
tags: core
dispatch:
CompositeExplicitAutograd: any_dims_default
@ -772,7 +766,6 @@
dispatch:
CPU, CUDA: any_out
MPS: any_out_mps
tags: reduction
- func: any.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -781,16 +774,13 @@
CPU, CUDA: any_dims_out
CompositeExplicitAutograd: any_dims_out_default
cpp_no_default_args: ['dim']
tags: reduction
- func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
dispatch:
@ -836,27 +826,25 @@
structured_delegate: argmax.out
device_check: NoCheck # TensorIterator
variants: function, method
tags: [core, reduction]
tags: core
- func: argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA: argmax_out
MPS: argmax_out_mps
tags: reduction
- func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
structured_delegate: argmin.out
device_check: NoCheck # TensorIterator
variants: function, method
tags: [core, reduction]
tags: core
- func: argmin.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA: argmin_out
MPS: argmin_out_mps
tags: reduction
- func: acosh(Tensor self) -> Tensor
variants: function, method
@ -1382,7 +1370,6 @@
dispatch:
SparseCPU: bmm_sparse_cpu
SparseCUDA: bmm_sparse_cuda
SparseMPS: bmm_sparse_mps
NestedTensorCPU: bmm_nested
NestedTensorCUDA: bmm_nested_cuda
tags: core
@ -1398,7 +1385,6 @@
MTIA: bmm_out_mtia
SparseCPU: bmm_out_sparse_cpu
SparseCUDA: bmm_out_sparse_cuda
SparseMPS: bmm_out_sparse_mps
SparseCsrCUDA: bmm_out_sparse_csr_cuda
- func: bmm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
@ -1881,14 +1867,12 @@
CUDA: count_nonzero_cuda
MPS: count_nonzero_mps
autogen: count_nonzero.dim_IntList_out
tags: reduction
- func: count_nonzero(Tensor self, int? dim=None) -> Tensor
variants: function, method
dispatch:
CompositeExplicitAutograd: count_nonzero
autogen: count_nonzero.out
tags: reduction
- func: cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor
variants: function, method
@ -3809,23 +3793,19 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: logsumexp
tags: reduction
- func: logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
# calls squeeze
CompositeExplicitAutogradNonFunctional: logsumexp_out
tags: reduction
- func: logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
@ -3875,7 +3855,6 @@
device_check: NoCheck # TensorIterator
structured_delegate: aminmax.out
variants: function, method
tags: reduction
- func: aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max)
device_check: NoCheck # TensorIterator
@ -3883,7 +3862,6 @@
dispatch:
CPU, CUDA, MTIA: aminmax_out
MPS: aminmax_out_mps
tags: reduction
- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
dispatch:
@ -3899,7 +3877,7 @@
variants: function, method
dispatch:
QuantizedCPU, QuantizedCUDA: qmax
tags: [core, reduction]
tags: core
- func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
device_check: NoCheck # TensorIterator
@ -3909,16 +3887,13 @@
dispatch:
CPU, CUDA, MTIA: max_out
MPS: max_out_mps
tags: reduction
- func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
device_check: NoCheck # TensorIterator
tags: reduction
- func: value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, SymInt[] sizes, bool keepdim) -> Tensor
variants: function
@ -3931,14 +3906,13 @@
- func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
variants: function, method
structured_delegate: amax.out
tags: [core, reduction]
tags: core
- func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA, MTIA: amax_out
MPS: amax_out_mps
tags: reduction
# Return: (Tensor output, Tensor indices)
- func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
@ -4000,14 +3974,13 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: mean
tags: [core, reduction]
tags: core
# For normal naming convention this should be `mean.out`. However since we already have `mean.out` we have to rename this.
- func: mean.dtype_out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CompositeExplicitAutograd: mean_dtype_out
tags: reduction
- func: mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
structured_delegate: mean.out
@ -4015,7 +3988,7 @@
variants: function, method
dispatch:
QuantizedCPU: mean_quantized_cpu
tags: [core, reduction]
tags: core
- func: mean.out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
structured: True
@ -4024,16 +3997,13 @@
CPU, CUDA: mean_out
MPS: mean_out_mps
QuantizedCPU: mean_out_quantized_cpu
tags: reduction
- func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: nanmean(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # Composite
@ -4096,7 +4066,7 @@
variants: function, method
dispatch:
QuantizedCPU, QuantizedCUDA: qmin
tags: [core, reduction]
tags: core
- func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
device_check: NoCheck # TensorIterator
@ -4106,28 +4076,24 @@
dispatch:
CPU, CUDA, MTIA: min_out
MPS: min_out_mps
tags: reduction
- func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
device_check: NoCheck # TensorIterator
tags: reduction
- func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
variants: function, method
structured_delegate: amin.out
tags: [core, reduction]
tags: core
- func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
dispatch:
CPU, CUDA, MTIA: amin_out
MPS: amin_out_mps
tags: reduction
# TODO: Add this function to MPS dispatch key so that we avoid declaring it in
# native_functions.yaml
@ -4207,7 +4173,7 @@
structured_delegate: mm.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: _sparse_mm
SparseCPU, SparseCUDA: _sparse_mm
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm
tags: core
@ -5892,7 +5858,6 @@
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: sum_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
autogen: sum.out
tags: reduction
- func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
# TODO: Align the signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype
@ -5903,12 +5868,11 @@
NestedTensorCPU: NestedTensor_sum_dim_CPU
SparseCPU, SparseCUDA, SparseMPS: sum_sparse_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_sparse_compressed
tags: [core, reduction]
tags: core
- func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: sum.IntList_out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
structured: True
@ -5916,11 +5880,9 @@
dispatch:
CPU, CUDA: sum_out
MPS: sum_out_mps
tags: reduction
- func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
# TODO: this function will be replaced once nested expand semantics have been settled on
- func: _nested_sum_backward(Tensor grad, Tensor self, int[1]? dim, bool keepdim=False) -> Tensor
@ -5932,13 +5894,11 @@
dispatch:
CPU, CUDA: nansum
MPS: nansum_mps
tags: reduction
- func: nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA: nansum_out
MPS: nansum_out_mps
tags: reduction
- func: hash_tensor(Tensor self, int[1] dim=[], *, bool keepdim=False, int mode=0) -> Tensor
variants: function, method
@ -6002,13 +5962,11 @@
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
@ -6017,19 +5975,16 @@
CPU, CUDA: std
MPS: std_mps
QuantizedCPU: std_quantized_cpu
tags: reduction
- func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
@ -6038,51 +5993,42 @@
CPU, CUDA: std_mean
MPS: std_mean_mps
autogen: std_mean.correction_out
tags: reduction
- func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
tags: reduction
- func: std.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA: std_out
QuantizedCPU: std_out_quantized_cpu
tags: reduction
- func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: function
tags: reduction
- func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
@ -6091,13 +6037,13 @@
CPU, CUDA: prod
MPS: prod_mps
autogen: prod.out
tags: [core, reduction]
tags: core
- func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
structured_delegate: prod.int_out
device_check: NoCheck # TensorIterator
variants: function, method
tags: [core, reduction]
tags: core
- func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
structured: True
@ -6105,16 +6051,13 @@
dispatch:
CPU, CUDA: prod_out
MPS: prod_out_mps
tags: reduction
- func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: t(Tensor(a) self) -> Tensor(a)
device_check: NoCheck
@ -6575,12 +6518,11 @@
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: [core, reduction]
tags: core
cpp_no_default_args: ["unbiased"]
- func: var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
@ -6590,51 +6532,43 @@
CPU, CUDA: var
MPS: var_mps
MTIA: var_mtia
tags: [core, reduction]
tags: core
- func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA: var_out
tags: reduction
- func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: function
tags: reduction
- func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
@ -6643,18 +6577,15 @@
CPU, CUDA: var_mean
MPS: var_mean_mps
autogen: var_mean.correction_out
tags: reduction
- func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
cpp_no_default_args: ["unbiased"]
tags: reduction
- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
device_check: NoCheck # TensorIterator
variants: function
tags: reduction
- func: view_as(Tensor(a) self, Tensor other) -> Tensor(a)
variants: method
@ -6914,7 +6845,6 @@
dispatch:
CompositeExplicitAutograd: norm
autogen: norm.ScalarOpt_dtype_out
tags: reduction
- func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor
device_check: NoCheck # TensorIterator
@ -6922,7 +6852,6 @@
dispatch:
CompositeExplicitAutograd: norm
autogen: norm.Scalar_out
tags: reduction
- func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
structured_delegate: norm.dtype_out
@ -6930,7 +6859,6 @@
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sparse_dtype_norm
tags: reduction
- func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor
structured_delegate: norm.out
@ -6938,7 +6866,6 @@
variants: function, method
dispatch:
SparseCPU, SparseCUDA, SparseMPS: sparse_norm
tags: reduction
- func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
structured: True
@ -6946,7 +6873,6 @@
dispatch:
CPU, CUDA: norm_dtype_out
MPS: norm_dtype_out_mps
tags: reduction
- func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True
@ -6954,26 +6880,21 @@
dispatch:
CPU, CUDA: norm_out
MPS: norm_out_mps
tags: reduction
# These four redispatch in their implementation, so OK to be CompositeImplicitAutograd
- func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor
device_check: NoCheck # TensorIterator
variants: function, method
tags: reduction
- func: norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: norm.names_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
tags: reduction
- func: frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent)
variants: method, function
@ -7191,7 +7112,6 @@
MTIA: addmm_out_mtia
SparseCPU: addmm_out_sparse_dense_cpu
SparseCUDA: addmm_out_sparse_dense_cuda
SparseMPS: addmm_out_sparse_dense_mps
SparseCsrCPU: addmm_out_sparse_compressed_cpu
SparseCsrCUDA: addmm_out_sparse_compressed_cuda
@ -7201,7 +7121,6 @@
dispatch:
SparseCPU: addmm_sparse_dense_cpu
SparseCUDA: addmm_sparse_dense_cuda
SparseMPS: addmm_sparse_dense_mps
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: addmm_sparse_compressed_dense
tags: core
@ -10159,14 +10078,12 @@
CPU, CUDA: min
MPS: min_mps
QuantizedCPU: min_quantized_cpu
tags: [reduction]
- func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA: min_unary_out
QuantizedCPU: min_quantized_unary_out
tags: [reduction]
- func: fmin(Tensor self, Tensor other) -> Tensor
structured_delegate: fmin.out
@ -10189,7 +10106,6 @@
CPU, CUDA: max
MPS: max_mps
QuantizedCPU: max_quantized_cpu
tags: [reduction]
- func: fmax(Tensor self, Tensor other) -> Tensor
structured_delegate: fmax.out
@ -10236,7 +10152,6 @@
dispatch:
CPU, CUDA: max_unary_out
QuantizedCPU: max_quantized_unary_out
tags: [reduction]
- func: minimum(Tensor self, Tensor other) -> Tensor
structured_delegate: minimum.out
@ -10356,7 +10271,6 @@
device_check: NoCheck # TensorIterator
structured_delegate: all.all_out
variants: method, function
tags: reduction
- func: all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck
@ -10365,7 +10279,6 @@
CPU, CUDA: all_all_out
MTIA: all_all_out_mtia
MPS: all_all_out_mps
tags: reduction
- func: any(Tensor self) -> Tensor
device_check: NoCheck # TensorIterator
@ -10373,7 +10286,7 @@
variants: method, function
dispatch:
SparseCPU, SparseCUDA, SparseMPS: any_sparse
tags: [core, reduction]
tags: core
- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck
@ -10381,7 +10294,6 @@
dispatch:
CPU, CUDA: any_all_out
MPS: any_all_out_mps
tags: reduction
- func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
@ -14433,7 +14345,6 @@
python_module: linalg
variants: function
structured_delegate: linalg_vector_norm.out
tags: reduction
- func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
python_module: linalg
@ -14441,7 +14352,6 @@
dispatch:
CPU, CUDA: linalg_vector_norm_out
MPS: linalg_vector_norm_out_mps
tags: reduction
- func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
python_module: linalg

View File

@ -1,6 +1,5 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/SparseTensorUtils.h>
#include <ATen/ExpandUtils.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/sparse/SparseStubs.h>
#include <ATen/native/sparse/SparseBinaryOpIntersectionCommon.h>
@ -19,8 +18,6 @@
#include <ATen/ops/ones_like.h>
#include <ATen/ops/argsort.h>
#include <ATen/ops/result_type.h>
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/addmm_native.h>
#include <ATen/ops/copy_sparse_to_sparse.h>
#include <ATen/ops/mul.h>
#endif
@ -36,305 +33,6 @@ static auto& lib = MetalShaderLibrary::getBundledLibrary();
#include <ATen/native/mps/Mul_metallib.h>
#endif
static Tensor& s_addmm_out_sparse_dense_mps(
Tensor& r,
const Tensor& t,
const SparseTensor& sparse_,
const Tensor& dense,
const Scalar& beta,
const Scalar& alpha) {
TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: sparse_dim must be 2, got ", sparse_.sparse_dim());
TORCH_CHECK(sparse_.dense_dim() == 0, "addmm: sparse values must be 0-dense-dim, got ", sparse_.dense_dim());
TORCH_CHECK(dense.dim() == 2, "addmm: 'dense' must be 2D, got ", dense.dim());
TORCH_CHECK(t.dim() == 2, "addmm: 't' must be 2D, got ", t.dim());
const int64_t I = sparse_.size(0);
const int64_t J = sparse_.size(1);
const int64_t K = dense.size(1);
TORCH_CHECK(dense.size(0) == J,
"addmm: dense (mat2) dim0 must be ", J, ", got ", dense.size(0));
TORCH_CHECK(t.size(0) == I && t.size(1) == K,
"addmm: 't' shape must be (", I, ", ", K, "), got (", t.size(0), ", ", t.size(1), ")");
r.resize_({I, K});
auto sparse = sparse_.coalesce();
const int64_t nnz = sparse._nnz();
if (nnz == 0 || I == 0 || K == 0) {
at::mul_out(r, t, beta);
return r;
}
const auto v_dtype = sparse._values().scalar_type();
const auto d_dtype = dense.scalar_type();
const auto t_dtype = t.scalar_type();
auto compute_dtype = c10::promoteTypes(c10::promoteTypes(v_dtype, d_dtype), t_dtype);
TORCH_CHECK(canCast(compute_dtype, r.scalar_type()),
"Can't convert computed type ", compute_dtype, " to output ", r.scalar_type());
auto indices2d = sparse._indices().contiguous();
auto values = sparse._values().to(compute_dtype);
auto dense_c = dense.to(compute_dtype).contiguous();
auto t_c = t.to(compute_dtype).contiguous();
const bool out_needs_cast = (r.scalar_type() != compute_dtype) || !r.is_contiguous();
Tensor out_buf = out_needs_cast
? at::empty({I, K}, r.options().dtype(compute_dtype))
: r;
auto out_contig = out_buf.contiguous();
auto device = r.device();
auto stream = getCurrentMPSStream();
const float alpha_f = alpha.to<float>();
const float beta_f = beta.to<float>();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
const std::string func = "spmm_addmm_coo_" + mps::scalarToMetalTypeString(values);
auto pso = lib.getPipelineStateForFunc(func);
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
const uint32_t tew = pso.threadExecutionWidth;
const uint32_t gridX = static_cast<uint32_t>(K);
const uint32_t gridZ = static_cast<uint32_t>(I);
const uint32_t tgW = std::min<uint32_t>(gridX, tew);
MTLSize grid = MTLSizeMake(gridX, 1, gridZ);
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
mtl_setArgs(enc,
indices2d,
values,
dense_c,
t_c,
out_contig,
std::array<uint32_t, 3>{static_cast<uint32_t>(I),
static_cast<uint32_t>(J),
static_cast<uint32_t>(K)},
std::array<float, 2>{alpha_f, beta_f},
static_cast<uint32_t>(nnz));
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
}
});
if (out_needs_cast) {
r.copy_(out_contig.to(r.scalar_type()));
}
return r;
}
static void build_batch_ptr_mps(
const Tensor& indices_dim0,
int64_t B,
Tensor& batch_ptr
) {
// Builds an array of pointers which point to each batches elements. Example:
// idx_b = [0, 0, 0, 1, 1, 2, 2, 2, 2] // 9 non-zero elements
// └─────┘ └──┘ └─────────┘
// batch 0 batch 1 batch 2
// batch_ptr = [0, 3, 5, 9]
// │ │ │ └─ end of batch 2 (total nnz)
// │ │ └──── batch 2 starts at index 5
// │ └─────── batch 1 starts at index 3
// └────────── batch 0 starts at index 0
TORCH_CHECK(indices_dim0.is_mps() && batch_ptr.is_mps(), "MPS device expected");
auto device = indices_dim0.device();
auto stream = getCurrentMPSStream();
const int64_t nnz = indices_dim0.numel();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pso = lib.getPipelineStateForFunc("build_batch_ptr_from_sorted_batches");
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
const uint32_t tew = pso.threadExecutionWidth;
const uint32_t Q = static_cast<uint32_t>(B + 1);
const uint32_t tgW = std::min<uint32_t>(Q, tew);
MTLSize grid = MTLSizeMake(Q, 1, 1);
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
mtl_setArgs(enc,
indices_dim0,
batch_ptr,
std::array<uint32_t, 2>{static_cast<uint32_t>(nnz),
static_cast<uint32_t>(B)});
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
}
});
}
static void build_row_ptr_per_batch_mps(
const Tensor& rows,
const Tensor& batch_ptr,
int64_t B,
int64_t I,
Tensor& row_ptr
) {
// Build per-batch CSR-style row pointer arrays from row indices sorted by batch
// Given:
// rows: 1-D array of length nnz with row ids in [0, I), sorted within each batch
// batch_ptr: length B+1, where [batch_ptr[b], batch_ptr[b+1]) is the subrange for batch b
// Produces:
// - row_ptr: shape [B, I+1]
//
// Example (B = 2, I = 4):
// rows = [0, 0, 1, 3, 0, 2, 2] // 7 non-zero elements
// └─── batch 0 ──┘ └─ batch 1 ─┘
// batch_ptr = [0, 4, 7]
// │ │ └─ end of batch 1 (total nnz)
// │ └──── end of batch 0/start of batch 1
// └─────── start of batch 0
//
// per-batch row pointers (I+1 entries each):
// row_ptr[0] = [0, 2, 3, 3, 4]
// row_ptr[1] = [0, 1, 1, 3, 3]
// laid out in memory: [0, 2, 3, 3, 4, 0, 1, 1, 3, 3]
TORCH_CHECK(rows.is_mps() && batch_ptr.is_mps() && row_ptr.is_mps(), "MPS device expected");
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pso = lib.getPipelineStateForFunc("build_row_ptr_from_sorted_rows_by_batch");
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
const uint32_t tew = pso.threadExecutionWidth;
const uint32_t Qx = static_cast<uint32_t>(I + 1);
const uint32_t Qy = static_cast<uint32_t>(B);
const uint32_t tgW = std::min<uint32_t>(Qx, tew);
MTLSize grid = MTLSizeMake(Qx, Qy, 1);
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
mtl_setArgs(enc,
rows,
batch_ptr,
row_ptr,
std::array<uint32_t, 2>{static_cast<uint32_t>(I),
static_cast<uint32_t>(B)});
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
}
});
}
Tensor& bmm_out_sparse_mps(const SparseTensor& self_, const Tensor& mat2_, Tensor& result_) {
TORCH_CHECK(result_.is_mps(), "bmm_sparse: expected 'out' to be MPS, got ", result_.device());
TORCH_CHECK(self_.is_mps(), "bmm_sparse: expected 'self' to be MPS, got ", self_.device());
TORCH_CHECK(mat2_.is_mps(), "bmm_sparse: expected 'mat2' to be MPS, got ", mat2_.device());
TORCH_CHECK(self_.dense_dim() == 0, "bmm_sparse: Tensor 'self' must have 0 dense dims, but has ", self_.dense_dim());
TORCH_CHECK(self_.sparse_dim() == 3, "bmm_sparse: Tensor 'self' must have 3 sparse dims, but has ", self_.sparse_dim());
TORCH_CHECK(mat2_.dim() == 3, "bmm_sparse: Tensor 'mat2' must have 3 dims, but has ", mat2_.dim());
TORCH_CHECK(self_.size(0) == mat2_.size(0), "bmm_sparse: 'self.size(0)' and 'mat2.size(0)' must match");
TORCH_CHECK(self_.size(2) == mat2_.size(1), "bmm_sparse: 'self.size(2)' and 'mat2.size(1)' must match");
const int64_t B = self_.size(0);
const int64_t I = self_.size(1);
const int64_t J = self_.size(2);
const int64_t K = mat2_.size(2);
auto self = self_.coalesce();
const int64_t nnz = self._nnz();
if (nnz == 0) {
return result_.zero_();
}
const auto computeDtype = at::kFloat;
auto indices = self._indices();
auto values = self._values();
auto values_c = values.scalar_type() == computeDtype ? values : values.to(computeDtype);
auto mat2_c = mat2_.scalar_type() == computeDtype ? mat2_ : mat2_.to(computeDtype);
auto mat2_contig = mat2_c.contiguous();
auto idx_b = indices.select(0, 0).contiguous();
auto idx_i = indices.select(0, 1).contiguous();
auto idx_j = indices.select(0, 2).contiguous();
// builds an array of pointers of where the batch_idx's pointer starts and ends
// look in function for better explanation
auto batch_ptr = at::empty({B + 1}, at::device(result_.device()).dtype(kLong));
build_batch_ptr_mps(idx_b, B, batch_ptr);
// build row_ptr per batch: for each (b, i) get [start, end) into rows/cols/vals
auto row_ptr = at::empty({B * (I + 1)}, at::device(result_.device()).dtype(kLong));
build_row_ptr_per_batch_mps(idx_i, batch_ptr, B, I, row_ptr);
const bool out_needs_cast = (result_.scalar_type() != computeDtype) || !result_.is_contiguous();
Tensor out_buf = out_needs_cast
? at::empty({B, I, K}, result_.options().dtype(computeDtype))
: result_;
auto out_contig = out_buf.contiguous();
auto stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
auto pso = lib.getPipelineStateForFunc("spmm_bmm_coo_rows_grouped_" + mps::scalarToMetalTypeString(values));
auto enc = stream->commandEncoder();
[enc setComputePipelineState:pso];
const uint32_t tew = pso.threadExecutionWidth;
const uint32_t tgW = std::min<uint32_t>((uint32_t)K, tew);
// One threadgroup per (row i, batch b), lanes cover K
MTLSize grid = MTLSizeMake(tgW, (uint32_t)I, (uint32_t)B);
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
mtl_setArgs(enc,
idx_i,
idx_j,
values_c,
mat2_contig,
out_contig,
row_ptr,
std::array<uint32_t, 4>{(uint32_t)B, (uint32_t)I, (uint32_t)J, (uint32_t)K});
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
}
});
if (out_needs_cast) {
result_.copy_(out_contig.to(result_.scalar_type()));
}
return result_;
}
Tensor bmm_sparse_mps(const Tensor& self, const Tensor& mat2) {
Tensor result = at::zeros({self.size(0), self.size(1), mat2.size(2)}, mat2.options());
return bmm_out_sparse_mps(self, mat2, result);
}
Tensor& addmm_out_sparse_dense_mps(
const Tensor& self,
const SparseTensor& mat1,
const Tensor& mat2,
const Scalar& beta,
const Scalar& alpha,
Tensor& result) {
c10::MaybeOwned<Tensor> b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out");
return s_addmm_out_sparse_dense_mps(result, *b_self, mat1, mat2, beta, alpha);
}
Tensor addmm_sparse_dense_mps(
const Tensor& self,
const SparseTensor& mat1,
const Tensor& mat2,
const Scalar& beta,
const Scalar& alpha
) {
c10::MaybeOwned<Tensor> b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out");
Tensor result = at::empty({0}, self.options());
return s_addmm_out_sparse_dense_mps(result, *b_self, mat1, mat2, beta, alpha);
}
static SparseTensor& mul_out_dense_sparse_mps(
const Tensor& dense,
const Tensor& sparse,

View File

@ -1,105 +1,10 @@
#include <metal_stdlib>
#include <c10/metal/indexing.h>
#include <c10/metal/utils.h>
using namespace c10::metal;
using namespace metal;
inline uint lower_bound_i64(device const long* arr, uint lo, uint hi, long key) {
uint l = lo, r = hi;
while (l < r) {
uint m = (l + r) >> 1;
long v = arr[m];
if (v < key) {
l = m + 1;
} else {
r = m;
}
}
return l;
}
inline uint upper_bound_i64(device const long* arr, uint lo, uint hi, long key) {
uint l = lo, r = hi;
while (l < r) {
uint m = (l + r) >> 1;
long v = arr[m];
if (v <= key) {
l = m + 1;
} else {
r = m;
}
}
return l;
}
kernel void build_row_ptr_from_sorted_rows_by_batch(
device const long* rows [[buffer(0)]],
device const long* batch_ptr [[buffer(1)]],
device long* row_ptr [[buffer(2)]],
constant uint2& dims [[buffer(3)]],
uint3 tid [[thread_position_in_grid]])
{
const uint I = dims.x;
const uint B = dims.y;
const uint i = tid.x;
const uint b = tid.y;
if (b >= B || i > I) return;
const uint base = (uint)batch_ptr[b];
const uint lim = (uint)batch_ptr[b + 1];
const ulong out_base = (ulong)b * (ulong)(I + 1);
if (i == I) {
row_ptr[out_base + (ulong)I] = (long)lim;
} else {
const long key = (long)i;
const uint pos = lower_bound_i64(rows, base, lim, key);
row_ptr[out_base + (ulong)i] = (long)pos;
}
}
template <typename T>
kernel void spmm_bmm_coo_rows_grouped(
device const long* rows [[buffer(0)]],
device const long* cols [[buffer(1)]],
device const T* vals [[buffer(2)]],
device const T* dense [[buffer(3)]],
device T* out [[buffer(4)]],
device const long* row_ptr [[buffer(5)]],
constant uint4& dims [[buffer(6)]],
uint3 tid [[thread_position_in_grid]],
uint3 ltid [[thread_position_in_threadgroup]],
uint3 tptg [[threads_per_threadgroup]])
{
const uint B = dims.x;
const uint I = dims.y;
const uint J = dims.z;
const uint K = dims.w;
const uint b = tid.z;
const uint i = tid.y;
const uint lane = ltid.x;
const uint tgW = tptg.x;
const ulong rp_base = (ulong)b * (ulong)(I + 1);
const uint start = (uint)row_ptr[rp_base + (ulong)i];
const uint end = (uint)row_ptr[rp_base + (ulong)i + 1];
for (uint k = lane; k < K; k += tgW) {
auto acc = static_cast<accum_t<T>>(T(0));
for (uint p = start; p < end; ++p) {
const uint c = (uint)cols[p];
const auto v = static_cast<accum_t<T>>(vals[p]);
const uint d_off = ((b * J) + c) * K + k;
const auto d = static_cast<accum_t<T>>(dense[d_off]);
acc += mul(v, d);
}
const uint y_off = ((b * I) + i) * K + k;
out[y_off] = static_cast<T>(acc);
}
}
template <typename T> struct MulAccum { using type = float; };
template <> struct MulAccum<float2> { using type = float2; };
template <typename T>
kernel void dense_sparse_mul_kernel(
@ -127,9 +32,10 @@ kernel void dense_sparse_mul_kernel(
ulong dense_idx = (ulong)key * (ulong)view_cols + (ulong)col;
ulong val_idx = (ulong)i * (ulong)view_cols + (ulong)col;
const auto a = static_cast<accum_t<T>>(values[val_idx]);
const auto b = static_cast<accum_t<T>>(dense[dense_idx]);
out_values[val_idx] = static_cast<T>(mul(a, b));
using accum_t = typename MulAccum<T>::type;
const accum_t a = static_cast<accum_t>(values[val_idx]);
const accum_t b = static_cast<accum_t>(dense[dense_idx]);
out_values[val_idx] = static_cast<T>(a * b);
}
kernel void intersect_binary_search(
@ -214,76 +120,6 @@ kernel void fused_gather_mul_kernel(
}
}
kernel void build_batch_ptr_from_sorted_batches(
device const long* batches [[buffer(0)]],
device long* batch_ptr [[buffer(1)]],
constant uint2& nnz_B [[buffer(2)]],
uint3 tid [[thread_position_in_grid]])
{
uint b = tid.x;
uint nnz = nnz_B.x;
uint batch = nnz_B.y;
if (b == batch) {
batch_ptr[b] = (long)nnz;
return;
}
uint lo = 0;
uint hi = nnz;
long key = (long)b;
while (lo < hi) {
uint mid = (lo + hi) >> 1;
long v = batches[mid];
if (v < key) lo = mid + 1;
else hi = mid;
}
batch_ptr[b] = (long)lo;
}
template <typename T>
kernel void spmm_addmm_coo(
device const long* indices2d [[buffer(0)]],
device const T* vals [[buffer(1)]],
device const T* dense [[buffer(2)]],
device const T* t_in [[buffer(3)]],
device T* out [[buffer(4)]],
constant uint3& dims [[buffer(5)]],
constant float2& alpha_beta [[buffer(6)]],
constant uint& nnz [[buffer(7)]],
uint3 tid [[thread_position_in_grid]])
{
const uint K = dims.z;
const uint k = tid.x;
const uint i = tid.z;
const float alpha = alpha_beta.x;
const float beta = alpha_beta.y;
device const long* rows = indices2d;
device const long* cols = indices2d + nnz;
const uint start = lower_bound_i64(rows, 0u, nnz, (long)i);
const uint end = upper_bound_i64(rows, 0u, nnz, (long)i);
// accumulator is float for scalar/half/bfloat and float2 for float2
auto acc = static_cast<accum_t<T>>(T(0));
for (uint p = start; p < end; ++p) {
const uint c = (uint)cols[p];
const auto v = static_cast<accum_t<T>>(vals[p]);
const uint dense_off = c * K + k;
const auto d = static_cast<accum_t<T>>(dense[dense_off]);
acc += mul(v, d);
}
const uint off = i * K + k;
const auto base = (beta != 0.0f) ? (static_cast<accum_t<T>>(t_in[off]) * beta) : static_cast<accum_t<T>>(T(0));
const auto y = base + alpha * acc;
out[off] = static_cast<T>(y);
}
#define INSTANTIATE_DENSE_SPARSE_MUL(DTYPE) \
template [[host_name("dense_sparse_mul_kernel_" #DTYPE)]] kernel void \
dense_sparse_mul_kernel<DTYPE>( \
@ -315,36 +151,6 @@ INSTANTIATE_DENSE_SPARSE_MUL(float2);
constant uint2& dims_output [[buffer(8)]], \
uint3 gid [[thread_position_in_grid]]);
INSTANTIATE_FOR_FLOAT_TYPES(INSTANTIATE_FUSED_GATHER_MUL);
#define INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED(DTYPE) \
template [[host_name("spmm_bmm_coo_rows_grouped_" #DTYPE)]] kernel void \
spmm_bmm_coo_rows_grouped<DTYPE>( \
device const long* rows [[buffer(0)]], \
device const long* cols [[buffer(1)]], \
device const DTYPE* vals [[buffer(2)]], \
device const DTYPE* dense [[buffer(3)]], \
device DTYPE* out [[buffer(4)]], \
device const long* row_ptr [[buffer(5)]], \
constant uint4& dims [[buffer(6)]], \
uint3 tid [[thread_position_in_grid]], \
uint3 ltid [[thread_position_in_threadgroup]], \
uint3 tptg [[threads_per_threadgroup]]);
INSTANTIATE_FOR_ALL_TYPES(INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED);
#define INSTANTIATE_SPMM_ADDMM_COO(DTYPE) \
template [[host_name("spmm_addmm_coo_" #DTYPE)]] kernel void \
spmm_addmm_coo<DTYPE>( \
device const long* indices2d [[buffer(0)]], \
device const DTYPE* vals [[buffer(1)]], \
device const DTYPE* dense [[buffer(2)]], \
device const DTYPE* t_in [[buffer(3)]], \
device DTYPE* out [[buffer(4)]], \
constant uint3& dims [[buffer(5)]], \
constant float2& alpha_beta [[buffer(6)]], \
constant uint& nnz [[buffer(7)]], \
uint3 tid [[thread_position_in_grid]]);
INSTANTIATE_FOR_ALL_TYPES(INSTANTIATE_SPMM_ADDMM_COO);
INSTANTIATE_FUSED_GATHER_MUL(float);
INSTANTIATE_FUSED_GATHER_MUL(half);
INSTANTIATE_FUSED_GATHER_MUL(bfloat);

View File

@ -93,7 +93,3 @@
This operator does not support cudagraphs. The presence of this tag on an operator will cause
Inductor to split the graph around this operator. Note that operators without this tag may still
not support CUDAGraphs. Inductor may have other hardcoded lists around that.
- tag: reduction
desc: |
This tag indicates that an operator performs a reduction operation, computing aggregate values
(sum, mean, max, min, etc.) across one or more dimensions of the input tensor(s).

View File

@ -202,6 +202,7 @@ supported:
- select_backward
- _trilinear
- linalg_pinv.atol_rtol_tensor
- svd
- logsumexp.out
symint:
- empty.memory_format

View File

@ -1751,8 +1751,8 @@ def maybe_snapshot_memory(should_snapshot_memory, suffix):
f"{output_filename.rstrip('.csv')}_{suffix}.pickle",
)
)
except Exception:
log.exception("Failed to save memory snapshot")
except Exception as e:
log.error("Failed to save memory snapshot, %s", e)
torch.cuda.memory._record_memory_history(enabled=None)
@ -2284,11 +2284,9 @@ class BenchmarkRunner:
)
):
is_same = False
except Exception as e:
except Exception:
# Sometimes torch.allclose may throw RuntimeError
exception_string = str(e)
accuracy_status = f"fail_exception: {exception_string}"
return record_status(accuracy_status, dynamo_start_stats=start_stats)
is_same = False
if not is_same:
accuracy_status = "eager_two_runs_differ"
@ -2405,11 +2403,9 @@ class BenchmarkRunner:
force_max_multiplier=force_max_multiplier,
):
is_same = False
except Exception as e:
except Exception:
# Sometimes torch.allclose may throw RuntimeError
exception_string = str(e)
accuracy_status = f"fail_exception: {exception_string}"
return record_status(accuracy_status, dynamo_start_stats=start_stats)
is_same = False
if not is_same:
if self.args.skip_accuracy_check:

View File

@ -124,7 +124,7 @@ with open(MODELS_FILENAME) as fh:
continue
batch_size = int(batch_size)
BATCH_SIZE_KNOWN_MODELS[model_name] = batch_size
assert BATCH_SIZE_KNOWN_MODELS
assert len(BATCH_SIZE_KNOWN_MODELS)
try:

View File

@ -296,8 +296,8 @@ class OperatorInputsLoader:
for key in self.operator_db.keys():
try:
op = eval(key)
except AttributeError:
log.warning("Evaluating an op name into an OpOverload", exc_info=True)
except AttributeError as ae:
log.warning("Evaluating an op name into an OpOverload: %s", ae)
continue
yield op

View File

@ -3,7 +3,6 @@ import sys
from benchmark_base import BenchmarkBase
import torch
from torch._dynamo.utils import CompileTimeInstructionCounter
class Benchmark(BenchmarkBase):
@ -33,11 +32,7 @@ class Benchmark(BenchmarkBase):
def _work(self):
# enable_cpp_symbolic_shape_guards has impact on this benchmark
# Keep using False value for consistency.
with (
torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", False),
torch._export.config.patch(use_new_tracer_experimental=True),
CompileTimeInstructionCounter.record(),
):
with torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", False):
torch.export.export(self.m, (self.input,), strict=True)

View File

@ -38,7 +38,7 @@ update_hint_regression,compile_time_instruction_count,1719000000,0.1
sum_floordiv_regression,compile_time_instruction_count,3686995725,0.1
sum_floordiv_regression,compile_time_instruction_count,966100000,0.1

1 add_loop_eager compile_time_instruction_count 3070000000 0.1
38
39
40
41
42
43
44

View File

@ -127,7 +127,7 @@ def trainbench(
bwd_time = bwd_start_event.elapsed_time(bwd_end_event)
return fwd_time, bwd_time
creator_args = {
creator_args = creator_args = {
"seqLength": seqLength,
"numLayers": numLayers,
"inputSize": inputSize,

View File

@ -12,7 +12,7 @@ def modeldef(request, net_name, executor, fuser):
# Given a 'net_name' provided by generate_tests, build the thing
name, rnn_creator, context = get_nn_runners(net_name)[0]
creator_args = {
creator_args = creator_args = {
"seqLength": 100,
"numLayers": 1,
"inputSize": 512,

View File

@ -85,7 +85,7 @@ class WeightOnlyInt8QuantHandler:
cur_state_dict[f"{fqn}.weight"] = int8_weight
cur_state_dict[f"{fqn}.scales"] = scales.to(mod.weight.dtype)
elif isinstance(mod, ConditionalFeedForward):
for weight_idx in range(3):
for weight_idx in range(0, 3):
weight_name = f"w{weight_idx + 1}"
scales_name = f"scales{weight_idx + 1}"
weight = getattr(mod, weight_name)

View File

@ -1729,8 +1729,10 @@ def define_buck_targets(
"torch/csrc/jit/backends/backend_debug_info.cpp",
"torch/csrc/jit/backends/backend_interface.cpp",
],
compiler_flags = get_pt_compiler_flags(),
fbandroid_compiler_flags = c2_fbandroid_xplat_compiler_flags,
compiler_flags = get_pt_compiler_flags() + select({
"DEFAULT": [],
"ovr_config//os:android": c2_fbandroid_xplat_compiler_flags
}),
# @lint-ignore BUCKLINT link_whole
link_whole = True,
linker_flags = get_no_as_needed_linker_flag(),
@ -2023,6 +2025,9 @@ def define_buck_targets(
"ovr_config//os:android-x86_64": [
"-mssse3",
],
}) + select({
"DEFAULT": [],
"ovr_config//os:android": c2_fbandroid_xplat_compiler_flags,
}),
exported_preprocessor_flags = get_aten_preprocessor_flags(),
exported_deps = [

View File

@ -9,7 +9,6 @@
#include <c10/core/Device.h>
#include <c10/core/DeviceType.h>
#include <c10/core/alignment.h>
#include <c10/macros/Export.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>

View File

@ -1,4 +1,5 @@
#include <c10/core/AllocatorConfig.h>
#include <c10/core/DeviceType.h>
#include <c10/util/env.h>
namespace c10::CachingAllocator {
@ -46,7 +47,7 @@ size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) {
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart);
const size_t interval_end =
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd);
TORCH_CHECK_VALUE(
TORCH_CHECK(
interval_end - interval_start == kRoundUpPowerOfTwoIntervals,
"kRoundUpPowerOfTwoIntervals mismatch");
@ -65,7 +66,7 @@ size_t AcceleratorAllocatorConfig::parseMaxSplitSize(
std::numeric_limits<size_t>::max() / kMB;
size_t val_env = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
val_env >= min_allowed_split_size_mb,
"CachingAllocator option max_split_size_mb too small, must be >= ",
min_allowed_split_size_mb);
@ -84,7 +85,7 @@ size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize(
std::numeric_limits<size_t>::max() / kMB;
size_t val_env = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
val_env >= min_allowed_split_size_mb,
"CachingAllocator option max_non_split_rounding_mb too small, must be >= ",
min_allowed_split_size_mb);
@ -99,7 +100,7 @@ size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold(
size_t i) {
tokenizer.checkToken(++i, ":");
double val_env = tokenizer.toDouble(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
val_env > 0 && val_env < 1.0,
"garbage_collect_threshold is invalid, set it in (0.0, 1.0)");
garbage_collection_threshold_ = val_env;
@ -120,7 +121,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
size_t value_index = i;
tokenizer.checkToken(++i, ":");
size_t value = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
value == 0 || llvm::isPowerOf2_64(value),
"For roundups, the divisions has to be power of 2 or 0 to disable roundup ");
@ -128,13 +129,12 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
std::fill(
std::next(
roundup_power2_divisions_.begin(),
static_cast<std::vector<size_t>::difference_type>(
last_index + 1)),
static_cast<std::vector<size_t>::difference_type>(last_index)),
roundup_power2_divisions_.end(),
value);
} else {
size_t boundary = tokenizer.toSizeT(value_index);
TORCH_CHECK_VALUE(
TORCH_CHECK(
llvm::isPowerOf2_64(boundary),
"For roundups, the intervals have to be power of 2 ");
@ -164,7 +164,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
"Expected closing bracket ']' in ConfigTokenizer but reached end of config");
} else { // Keep this for backwards compatibility
size_t value = tokenizer.toSizeT(i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
llvm::isPowerOf2_64(value),
"For roundups, the divisions has to be power of 2 ");
std::fill(
@ -224,7 +224,7 @@ void AcceleratorAllocatorConfig::parseArgs(const std::string& env) {
// If a device-specific configuration parser hook is registered, it will
// check if the key is unrecognized.
if (device_config_parser_hook_) {
TORCH_CHECK_VALUE(
TORCH_CHECK(
getKeys().find(key) != getKeys().end(),
"Unrecognized key '",
key,

View File

@ -76,7 +76,7 @@ class ConfigTokenizer {
} else if (token == "False") {
return false;
} else {
TORCH_CHECK_VALUE(
TORCH_CHECK(
false,
"Expected 'True' or 'False' at index ",
i,
@ -253,7 +253,7 @@ class C10_API AcceleratorAllocatorConfig {
device_config_parser_hook_ = std::move(hook);
auto& mutable_keys = getMutableKeys();
for (auto& key : keys) {
TORCH_CHECK_VALUE(
TORCH_CHECK(
mutable_keys.insert(key).second,
"Duplicated key '",
key,

View File

@ -52,9 +52,7 @@ constexpr DispatchKeySet math_dispatch_keyset = backend_dispatch_keyset |
// where we would like to support composite implicit kernels but not
// explicit kernels therefore we manually add the key to the
// math_dispatch_keyset
DispatchKeySet{DispatchKey::NestedTensor} |
// Functionalize should always reuse CompositeImplicit decomps.
DispatchKeySet{DispatchKey::Functionalize};
DispatchKeySet{DispatchKey::NestedTensor};
constexpr DispatchKeySet nested_dispatch_keyset =
DispatchKeySet(

View File

@ -102,7 +102,7 @@ uint64_t getNonDeterministicRandom(bool is_cuda) {
} else {
std::random_device rd;
// limit to 53 bits to ensure unique representation in double
s = (((static_cast<uint64_t>(rd())) << 32) + rd()) & 0x1FFFFFFFFFFFFF;
s = ((((uint64_t)rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF;
}
return s;
}

View File

@ -20,8 +20,7 @@ void maybeApplyRefcountedDeleter(const c10::Storage& storage) {
std::lock_guard<std::mutex> guard(replace_data_ptr_mutex);
c10::DataPtr& data_ptr = storage.mutable_data_ptr();
if (reinterpret_cast<const void*>(data_ptr.get_deleter()) ==
reinterpret_cast<const void*>(&c10::refcounted_deleter)) {
if ((void*)data_ptr.get_deleter() == (void*)&c10::refcounted_deleter) {
// Data pointer is already shared
return;
}

View File

@ -52,19 +52,6 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType)
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CONSTANT)
#undef DEFINE_CONSTANT
inline const char* toString(ScalarType t) {
#define DEFINE_CASE(_, name) \
case ScalarType::name: \
return #name;
switch (t) {
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CASE)
default:
return "UNKNOWN_SCALAR";
}
#undef DEFINE_CASE
}
inline size_t elementSize(ScalarType t) {
#define CASE_ELEMENTSIZE_CASE(ctype, name) \
case ScalarType::name: \
@ -308,12 +295,6 @@ inline bool canCast(const ScalarType from, const ScalarType to) {
C10_API ScalarType promoteTypes(ScalarType a, ScalarType b);
inline std::ostream& operator<<(
std::ostream& stream,
at::ScalarType scalar_type) {
return stream << toString(scalar_type);
}
// Returns a pair of strings representing the names for each dtype.
// The returned pair is (name, legacy_name_if_applicable)
C10_API std::pair<std::string, std::string> getDtypeNames(

View File

@ -4,6 +4,7 @@
#include <c10/core/SymNodeImpl.h>
#include <c10/util/intrusive_ptr.h>
#include <c10/util/safe_numerics.h>
#include <functional>
namespace c10 {
@ -83,7 +84,7 @@ DEFINE_BINARY(max_slow_path, sym_max, SymInt)
SymInt::operator SymFloat() const {
if (auto ma = maybe_as_int()) {
return SymFloat(static_cast<double>(*ma));
return SymFloat(double(*ma));
} else {
return SymFloat(toSymNodeImplUnowned()->sym_float());
}

View File

@ -9,6 +9,7 @@
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <c10/util/Logging.h>
#include <c10/util/accumulate.h>
#include <c10/util/irange.h>
#include <optional>
#include <utility>

View File

@ -1,5 +1,9 @@
#include <c10/core/TensorOptions.h>
#include <c10/core/Device.h>
#include <c10/core/Layout.h>
#include <c10/util/Optional.h>
#include <iostream>
namespace c10 {

View File

@ -1,7 +1,6 @@
#pragma once
#include <cstddef>
#include <new>
namespace c10 {
@ -19,12 +18,4 @@ constexpr size_t gPagesize = 4096;
// since the default thp pagesize is 2MB, enable thp only
// for buffers of size 2MB or larger to avoid memory bloating
constexpr size_t gAlloc_threshold_thp = static_cast<size_t>(2) * 1024 * 1024;
// Cache line size used to avoid false sharing between threads. Falls back to 64
// bytes if C++17 feature is unavailable.
#ifdef __cpp_lib_hardware_interference_size
using std::hardware_destructive_interference_size;
#else
constexpr std::size_t hardware_destructive_interference_size = 64;
#endif
} // namespace c10

View File

@ -2,6 +2,7 @@
#include <c10/core/Allocator.h>
#include <c10/core/StorageImpl.h>
#include <c10/core/alignment.h>
#include <c10/core/impl/COWDeleter.h>
#include <c10/util/Exception.h>
#include <c10/util/ParallelGuard.h>
@ -44,8 +45,7 @@ bool has_simple_data_ptr(const c10::StorageImpl& storage) {
}
bool is_cow_data_ptr(const c10::DataPtr& data_ptr) {
return reinterpret_cast<const void*>(data_ptr.get_deleter()) ==
reinterpret_cast<const void*>(&cow::cow_deleter);
return (void*)data_ptr.get_deleter() == (void*)&cow::cow_deleter;
}
c10::intrusive_ptr<StorageImpl> lazy_clone_storage(StorageImpl& storage) {

View File

@ -1,4 +1,5 @@
#include <c10/core/DispatchKey.h>
#include <c10/core/SafePyObject.h>
#include <c10/core/impl/LocalDispatchKeySet.h>
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <c10/util/irange.h>

View File

@ -20,7 +20,7 @@ size_t CUDAAllocatorConfig::parseAllocatorConfig(
tokenizer.checkToken(++i, ":");
i++; // Move to the value after the colon
#ifdef USE_ROCM
TORCH_CHECK_VALUE(
TORCH_CHECK(
((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1) ||
(tokenizer[i] == PYTORCH_TOKEN2)),
"Unknown allocator backend, "
@ -36,7 +36,7 @@ size_t CUDAAllocatorConfig::parseAllocatorConfig(
" != ",
get()->name());
#else // USE_ROCM
TORCH_CHECK_VALUE(
TORCH_CHECK(
((tokenizer[i] == "native") || (tokenizer[i] == PYTORCH_TOKEN1)),
"Unknown allocator backend, "
"options are native and " PYTORCH_TOKEN1);
@ -109,7 +109,7 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
} else {
const auto& keys =
c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
TORCH_CHECK_VALUE(
TORCH_CHECK(
keys.find(key) != keys.end(),
"Unrecognized key '",
key,
@ -151,12 +151,12 @@ size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
size_t i) {
tokenizer.checkToken(++i, ":");
size_t val2 = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
TORCH_CHECK(
llvm::isPowerOf2_64(val2),
"Number of register threads has to be power of 2, got ",
val2);
auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
TORCH_CHECK_VALUE(
TORCH_CHECK(
val2 <= maxThreads,
"Number of register threads should be less than or equal to ",
maxThreads,
@ -171,8 +171,7 @@ size_t CUDAAllocatorConfig::parsePinnedReserveSegmentSize(
size_t i) {
tokenizer.checkToken(++i, ":");
size_t val2 = tokenizer.toSizeT(++i);
TORCH_CHECK_VALUE(
val2 > 0, "Pinned reserve segment size has to be greater than 0");
TORCH_CHECK(val2 > 0, "Pinned reserve segment size has to be greater than 0");
m_pinned_reserve_segment_size_mb = val2;
return i;
}

View File

@ -3,7 +3,6 @@
#include <c10/core/AllocatorConfig.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/util/Deprecated.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
@ -18,14 +17,9 @@ enum class Expandable_Segments_Handle_Type : int {
// Environment config parser
class C10_CUDA_API CUDAAllocatorConfig {
public:
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_split_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size() instead.")
static size_t max_split_size() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size();
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::garbage_collection_threshold() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::garbage_collection_threshold() instead.")
static double garbage_collection_threshold() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
garbage_collection_threshold();
@ -70,8 +64,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
return instance().m_pinned_num_register_threads;
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_background_threads() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::pinned_use_background_threads() instead.")
static bool pinned_use_background_threads() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
pinned_use_background_threads();
@ -88,15 +80,11 @@ class C10_CUDA_API CUDAAllocatorConfig {
return 128;
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
static size_t roundup_power2_divisions(size_t size) {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions(size);
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
static std::vector<size_t> roundup_power2_divisions() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions();
@ -107,8 +95,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
max_non_split_rounding_size();
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::last_allocator_settings() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::last_allocator_settings() instead.")
static std::string last_allocator_settings() {
return c10::CachingAllocator::getAllocatorSettings();
}

View File

@ -512,7 +512,7 @@ struct ExpandableSegment {
header.segment_size = segment_size_;
header.num_handles = end - begin;
buf.write(reinterpret_cast<const char*>(&header), sizeof(ShareHeader));
buf.write((const char*)&header, sizeof(ShareHeader));
for (auto i : c10::irange(begin, end)) {
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
auto& handle = handles_.at(i).value();
@ -528,9 +528,7 @@ struct ExpandableSegment {
TORCH_CHECK(
handle.shareable_handle != std::nullopt,
"shareable_handle is null");
buf.write(
reinterpret_cast<const char*>(&*handle.shareable_handle),
sizeof(int));
buf.write((const char*)&*handle.shareable_handle, sizeof(int));
} else {
if (!handle.shareable_handle) {
CUmemFabricHandle fabric_handle;
@ -543,8 +541,7 @@ struct ExpandableSegment {
handle.shareable_handle != std::nullopt,
"shareable_handle is null");
buf.write(
reinterpret_cast<const char*>(&*handle.shareable_handle),
sizeof(CUmemFabricHandle));
(const char*)&*handle.shareable_handle, sizeof(CUmemFabricHandle));
}
}
return rangeFromHandles(begin, end);
@ -555,7 +552,7 @@ struct ExpandableSegment {
std::vector<c10::DeviceIndex> peers,
std::istream& buf) {
ShareHeader header{};
buf.read(reinterpret_cast<char*>(&header), sizeof(ShareHeader));
buf.read((char*)&header, sizeof(ShareHeader));
auto segment = std::make_unique<ExpandableSegment>(
device, std::nullopt, header.segment_size, std::move(peers));
// older build setups (e.g. multiwheels) do not have this syscall, added 2020
@ -577,11 +574,11 @@ struct ExpandableSegment {
for (auto i : c10::irange(header.num_handles)) {
(void)i;
int fd = 0;
buf.read(reinterpret_cast<char*>(&fd), sizeof(int));
buf.read((char*)&fd, sizeof(int));
auto myfd = syscall(SYS_pidfd_getfd, pidfd, fd, 0);
if (myfd == -1) {
auto err = errno;
close(static_cast<int>(pidfd));
close((int)pidfd);
for (auto& h : segment->handles_) {
C10_CUDA_DRIVER_CHECK(
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
@ -601,16 +598,15 @@ struct ExpandableSegment {
(void*)(uintptr_t)myfd,
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
LOG(INFO) << "use posix fd to import expandable segments.";
close(static_cast<int>(myfd));
close((int)myfd);
segment->handles_.emplace_back(Handle{handle, std::nullopt});
}
close(static_cast<int>(pidfd));
close((int)pidfd);
} else {
for (auto i : c10::irange(header.num_handles)) {
(void)i;
CUmemFabricHandle fabric_handle;
buf.read(
reinterpret_cast<char*>(&fabric_handle), sizeof(CUmemFabricHandle));
buf.read((char*)&fabric_handle, sizeof(CUmemFabricHandle));
CUmemGenericAllocationHandle handle = 0;
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemImportFromShareableHandle_(
&handle,
@ -941,7 +937,7 @@ class EventPool {
private:
struct PerDevicePool {
alignas(hardware_destructive_interference_size) std::mutex mutex_;
alignas(64) std::mutex mutex_;
std::vector<std::unique_ptr<cudaEvent_t>> event_pool_;
};
std::vector<PerDevicePool> pools_;
@ -1063,7 +1059,7 @@ class RingBuffer {
void setMaxEntries(size_t size) {
std::lock_guard<std::mutex> lk(alloc_trace_lock);
alloc_trace_max_entries_ = std::max(static_cast<size_t>(1), size);
alloc_trace_max_entries_ = std::max(size_t(1), size);
}
void insertEntries(const T& entry) {
@ -1264,9 +1260,6 @@ class DeviceCachingAllocator {
// thread local compile context for each device
static thread_local std::stack<std::string> compile_context;
// thread local user metadata for annotating allocations
static thread_local std::string user_metadata;
public:
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
explicit DeviceCachingAllocator(c10::DeviceIndex id)
@ -1274,7 +1267,7 @@ class DeviceCachingAllocator {
large_blocks(/*small=*/false),
small_blocks(/*small=*/true) {
stats.max_split_size =
static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size());
static_cast<int64_t>(CUDAAllocatorConfig::max_split_size());
context_recorder_.store(nullptr);
}
@ -1309,14 +1302,6 @@ class DeviceCachingAllocator {
}
}
void setUserMetadata(const std::string& metadata) {
user_metadata = metadata;
}
std::string getUserMetadata() {
return user_metadata;
}
bool checkPoolLiveAllocations(
MempoolId_t mempool_id,
const std::unordered_set<void*>& expected_live_allocations) const {
@ -1409,8 +1394,7 @@ class DeviceCachingAllocator {
// Do garbage collection if the flag is set.
if (C10_UNLIKELY(
set_fraction &&
AcceleratorAllocatorConfig::garbage_collection_threshold() >
0.0)) {
CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
garbage_collect_cached_blocks(context);
}
// Attempt allocate
@ -1662,7 +1646,7 @@ class DeviceCachingAllocator {
stats.active_bytes[stat_type].increase(block->size);
stats.requested_bytes[stat_type].increase(block->requested_size);
});
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_allocations.increase(1);
auto allocated_bytes_gauge =
@ -1931,7 +1915,7 @@ class DeviceCachingAllocator {
block->pool->owner_MempoolId(),
context ? context : block->context_when_allocated);
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_allocations.decrease(1);
// If the block has been used on more than one stream, handle accordingly.
@ -1995,16 +1979,15 @@ class DeviceCachingAllocator {
while (base_block->prev) {
base_block = base_block->prev;
}
offset = static_cast<const char*>(block->ptr) -
static_cast<const char*>(base_block->ptr);
offset = (char*)block->ptr - (char*)base_block->ptr;
cudaIpcMemHandle_t handle;
C10_CUDA_CHECK(cudaIpcGetMemHandle(&handle, base_block->ptr));
ss.write(reinterpret_cast<const char*>(&handle), CUDA_IPC_HANDLE_SIZE);
ss.write((char*)&handle, CUDA_IPC_HANDLE_SIZE);
} else {
ss.put(SHAREABLE_CUDA_EXPANDABLE_SEGMENT);
auto full_range = block->expandable_segment_->share(
SegmentRange(block->ptr, block->size), ss);
offset = static_cast<const char*>(block->ptr) - full_range.ptr;
offset = (char*)block->ptr - full_range.ptr;
}
return ShareableHandle{offset, ss.str()};
}
@ -2505,8 +2488,7 @@ class DeviceCachingAllocator {
if (size < kMinBlockSize) {
return kMinBlockSize;
} else {
auto divisions =
AcceleratorAllocatorConfig::roundup_power2_divisions(size);
auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size);
if (divisions > 1 && size > (kMinBlockSize * divisions)) {
return roundup_power2_next_division(size, divisions);
} else {
@ -3000,7 +2982,7 @@ class DeviceCachingAllocator {
if (block->pool->is_small || CUDAAllocatorConfig::expandable_segments()) {
return remaining >= kMinBlockSize;
} else {
return (size < AcceleratorAllocatorConfig::max_split_size()) &&
return (size < CUDAAllocatorConfig::max_split_size()) &&
(remaining > kSmallSize);
}
}
@ -3020,7 +3002,7 @@ class DeviceCachingAllocator {
if (C10_UNLIKELY(
set_fraction &&
AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
// Track block reuse interval only when garbage collection is enabled.
++pool.get_free_blocks_call_count;
}
@ -3062,13 +3044,13 @@ class DeviceCachingAllocator {
}
// Do not return an oversized block for a large request
if ((p.size() < AcceleratorAllocatorConfig::max_split_size()) &&
((*it)->size >= AcceleratorAllocatorConfig::max_split_size()))
if ((p.size() < CUDAAllocatorConfig::max_split_size()) &&
((*it)->size >= CUDAAllocatorConfig::max_split_size()))
return false;
// Allow oversized block size to be rounded up but within a limit
if ((p.size() >= AcceleratorAllocatorConfig::max_split_size()) &&
if ((p.size() >= CUDAAllocatorConfig::max_split_size()) &&
((*it)->size >=
p.size() + AcceleratorAllocatorConfig::max_non_split_rounding_size()))
p.size() + CUDAAllocatorConfig::max_non_split_rounding_size()))
return false;
p.block = *it;
pool.blocks.erase(it);
@ -3091,7 +3073,7 @@ class DeviceCachingAllocator {
// therefore should be of less overheads.
size_t gc_threshold = static_cast<size_t>(
AcceleratorAllocatorConfig::garbage_collection_threshold() *
CUDAAllocatorConfig::garbage_collection_threshold() *
static_cast<double>(allowed_memory_maximum));
// No need to trigger GC yet
if (total_allocated_memory <= gc_threshold) {
@ -3234,13 +3216,12 @@ class DeviceCachingAllocator {
}
total_allocated_memory += size;
p.block = new Block(
p.device(), p.stream(), size, p.pool, static_cast<char*>(ptr));
p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr);
for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) {
stats.segment[stat_type].increase(1);
stats.reserved_bytes[stat_type].increase(size);
});
if (size >= AcceleratorAllocatorConfig::max_split_size())
if (size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_segments.increase(1);
auto reserved_bytes_gauge =
STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
@ -3269,7 +3250,7 @@ class DeviceCachingAllocator {
bool release_available_cached_blocks(
const AllocParams& p,
const std::shared_ptr<GatheredContext>& context) {
if (AcceleratorAllocatorConfig::max_split_size() ==
if (CUDAAllocatorConfig::max_split_size() ==
std::numeric_limits<size_t>::max())
return false;
BlockPool& pool = *p.pool;
@ -3277,8 +3258,8 @@ class DeviceCachingAllocator {
// because of std::unique_ptr, block cannot be trivially copied
// Use constructor for search key.
Block key(p.search_key.device, p.search_key.stream, p.search_key.size);
key.size = (key.size < AcceleratorAllocatorConfig::max_split_size())
? AcceleratorAllocatorConfig::max_split_size()
key.size = (key.size < CUDAAllocatorConfig::max_split_size())
? CUDAAllocatorConfig::max_split_size()
: key.size;
auto it = pool.blocks.lower_bound(&key);
if (it == pool.blocks.end() || (*it)->stream != p.stream() ||
@ -3291,7 +3272,7 @@ class DeviceCachingAllocator {
--it; // Back up one item. Now on the largest block for the correct
// stream
while ((totalReleased < key.size) &&
((*it)->size >= AcceleratorAllocatorConfig::max_split_size()) &&
((*it)->size >= CUDAAllocatorConfig::max_split_size()) &&
((*it)->stream == p.stream())) {
auto cur = it;
bool is_first = cur == pool.blocks.begin();
@ -3416,7 +3397,7 @@ class DeviceCachingAllocator {
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
.current);
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_segments.decrease(1);
pool->blocks.erase(block);
delete block;
@ -3701,8 +3682,7 @@ class DeviceCachingAllocator {
mempool_id,
getApproximateTime(),
record_context_ >= RecordContext::ALLOC ? std::move(context) : nullptr,
compile_string,
user_metadata);
compile_string);
// Callbacks should not include any Pytorch call
for (const auto& cb : trace_trackers_) {
@ -3757,7 +3737,11 @@ static void uncached_delete(void* ptr) {
static void local_raw_delete(void* ptr);
thread_local std::stack<std::string> DeviceCachingAllocator::compile_context;
thread_local std::string DeviceCachingAllocator::user_metadata;
#ifdef __cpp_lib_hardware_interference_size
using std::hardware_destructive_interference_size;
#else
static constexpr std::size_t hardware_destructive_interference_size = 64;
#endif
class NativeCachingAllocator : public CUDAAllocator {
private:
@ -3778,7 +3762,7 @@ class NativeCachingAllocator : public CUDAAllocator {
allocated_blocks;
static size_t get_mutex_shard_id(void* ptr) {
return twang_mix64(reinterpret_cast<uintptr_t>(ptr)) % kNumMutexShard;
return twang_mix64((size_t)ptr) % kNumMutexShard;
}
void add_allocated_block(Block* block) {
@ -3815,8 +3799,8 @@ class NativeCachingAllocator : public CUDAAllocator {
if (size < device_count) {
device_allocator.resize(device_count);
for (const auto i : c10::irange(size, device_count)) {
device_allocator[i] = std::make_unique<DeviceCachingAllocator>(
static_cast<c10::DeviceIndex>(i));
device_allocator[i] =
std::make_unique<DeviceCachingAllocator>(c10::DeviceIndex(i));
}
}
}
@ -3950,18 +3934,6 @@ class NativeCachingAllocator : public CUDAAllocator {
device_allocator[device]->popCompileContext();
}
void setUserMetadata(const std::string& metadata) override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
device_allocator[device]->setUserMetadata(metadata);
}
std::string getUserMetadata() override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
return device_allocator[device]->getUserMetadata();
}
bool isHistoryEnabled() override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
@ -4062,8 +4034,8 @@ class NativeCachingAllocator : public CUDAAllocator {
auto& md = result.config_metadata;
md.garbage_collection_threshold =
AcceleratorAllocatorConfig::garbage_collection_threshold();
md.max_split_size = AcceleratorAllocatorConfig::max_split_size();
CUDAAllocatorConfig::garbage_collection_threshold();
md.max_split_size = CUDAAllocatorConfig::max_split_size();
md.pinned_num_register_threads =
CUDAAllocatorConfig::pinned_num_register_threads();
md.expandable_segments = CUDAAllocatorConfig::expandable_segments();
@ -4071,12 +4043,11 @@ class NativeCachingAllocator : public CUDAAllocator {
CUDAAllocatorConfig::release_lock_on_cudamalloc();
md.pinned_use_host_register =
CUDAAllocatorConfig::pinned_use_cuda_host_register();
md.last_allocator_settings =
AcceleratorAllocatorConfig::last_allocator_settings();
md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings();
md.graph_capture_record_stream_reuse =
CUDAAllocatorConfig::graph_capture_record_stream_reuse();
md.roundup_power2_divisions =
AcceleratorAllocatorConfig::roundup_power2_divisions();
CUDAAllocatorConfig::roundup_power2_divisions();
return result;
}
@ -4345,7 +4316,7 @@ class NativeCachingAllocator : public CUDAAllocator {
// SHARABLE_CUDA_MALLOC
if (type == SHAREABLE_CUDA_MALLOC) {
cudaIpcMemHandle_t cuda_handle;
ss.read(reinterpret_cast<char*>(&cuda_handle), CUDA_IPC_HANDLE_SIZE);
ss.read((char*)&cuda_handle, CUDA_IPC_HANDLE_SIZE);
C10_CUDA_CHECK(cudaIpcOpenMemHandle(
&cuda_ipc_ptr_, cuda_handle, cudaIpcMemLazyEnablePeerAccess));
} else if (type == SHAREABLE_CUDA_EXPANDABLE_SEGMENT) {
@ -4454,12 +4425,11 @@ CUDAAllocator* allocator();
} // namespace CudaMallocAsync
struct BackendStaticInitializer {
// Parses the environment configuration for CUDA/ROCm allocator backend at
// load time. This duplicates some logic from CUDAAllocatorConfig to ensure
// lazy initialization without triggering global static constructors. The
// function looks for the key "backend" and returns the appropriate allocator
// instance based on its value. If no valid configuration is found, it falls
// back to the default Native allocator.
// Parses env for backend at load time, duplicating some logic from
// CUDAAllocatorConfig. CUDAAllocatorConfig double-checks it later (at
// runtime). Defers verbose exceptions and error checks, including Cuda
// version checks, to CUDAAllocatorConfig's runtime doublecheck. If this
// works, maybe we should move all of CUDAAllocatorConfig here?
CUDAAllocator* parseEnvForBackend() {
auto val = c10::utils::get_env("PYTORCH_CUDA_ALLOC_CONF");
#ifdef USE_ROCM
@ -4468,35 +4438,34 @@ struct BackendStaticInitializer {
val = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
}
#endif
if (!val.has_value()) {
val = c10::utils::get_env("PYTORCH_ALLOC_CONF");
}
if (val.has_value()) {
c10::CachingAllocator::ConfigTokenizer tokenizer(val.value());
for (size_t i = 0; i < tokenizer.size(); i++) {
const auto& key = tokenizer[i];
if (key == "backend") {
tokenizer.checkToken(++i, ":");
i++; // Move to the value after the colon
if (tokenizer[i] == "cudaMallocAsync"
const std::string& config = val.value();
std::regex exp("[\\s,]+");
std::sregex_token_iterator it(config.begin(), config.end(), exp, -1);
std::sregex_token_iterator end;
std::vector<std::string> options(it, end);
for (auto option : options) {
std::regex exp2("[:]+");
std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1);
std::sregex_token_iterator end2;
std::vector<std::string> kv(it2, end2);
if (kv.size() >= 2) {
if (kv[0] == "backend") {
#ifdef USE_ROCM
// convenience for ROCm users to allow either CUDA or HIP env var
|| tokenizer[i] == "hipMallocAsync"
// convenience for ROCm users to allow either CUDA or HIP env var
if (kv[1] == "cudaMallocAsync" || kv[1] == "hipMallocAsync")
#else
if (kv[1] == "cudaMallocAsync")
#endif
) {
return CudaMallocAsync::allocator();
return CudaMallocAsync::allocator();
if (kv[1] == "native")
return &Native::allocator;
}
break;
} else {
// Skip the key and its value
i = tokenizer.skipKey(i);
}
if (i + 1 < tokenizer.size()) {
tokenizer.checkToken(++i, ",");
}
}
}
// Default fallback allocator.
return &Native::allocator;
}

View File

@ -118,8 +118,7 @@ struct TraceEntry {
MempoolId_t mempool,
approx_time_t time,
std::shared_ptr<GatheredContext> context = nullptr,
std::string compile_context = "",
std::string user_metadata = "")
std::string compile_context = "")
: action_(action),
device_(device),
addr_(addr),
@ -127,8 +126,7 @@ struct TraceEntry {
stream_(stream),
size_(size),
mempool_(std::move(mempool)),
compile_context_(std::move(compile_context)),
user_metadata_(std::move(user_metadata)) {
compile_context_(std::move(compile_context)) {
time_.approx_t_ = time;
}
Action action_;
@ -140,7 +138,6 @@ struct TraceEntry {
MempoolId_t mempool_;
trace_time_ time_{};
std::string compile_context_;
std::string user_metadata_;
};
// Calls made by record_function will save annotations
@ -300,10 +297,6 @@ class CUDAAllocator : public DeviceAllocator {
const std::vector<std::pair<std::string, std::string>>& /*md*/) {}
virtual void pushCompileContext(std::string& md) {}
virtual void popCompileContext() {}
virtual void setUserMetadata(const std::string& metadata) {}
virtual std::string getUserMetadata() {
return "";
}
virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
// Attached AllocatorTraceTracker callbacks will be called while the
@ -543,14 +536,6 @@ inline void enablePeerAccess(
get()->enablePeerAccess(dev, dev_to_access);
}
inline void setUserMetadata(const std::string& metadata) {
get()->setUserMetadata(metadata);
}
inline std::string getUserMetadata() {
return get()->getUserMetadata();
}
} // namespace c10::cuda::CUDACachingAllocator
namespace c10::cuda {

View File

@ -1,6 +1,8 @@
#include <c10/cuda/CUDADeviceAssertionHost.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Backtrace.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
#include <c10/util/irange.h>
#include <cuda_runtime.h>

View File

@ -4,6 +4,7 @@
#include <c10/cuda/CUDAGuard.h>
#include <c10/util/UniqueVoidPtr.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>
#include <unordered_set>
#include <vector>
@ -46,7 +47,7 @@ bool operator==(const UsageStream& lhs, const UsageStream& rhs) {
struct UsageStreamHash {
size_t operator()(const UsageStream& us) const noexcept {
return std::hash<void*>{}(us.stream) + static_cast<size_t>(us.device);
return std::hash<void*>{}(us.stream) + size_t(us.device);
}
};

View File

@ -1,6 +1,7 @@
#include <c10/cuda/CUDAMiscFunctions.h>
#include <c10/util/env.h>
#include <cuda_runtime.h>
#include <cstring>
#include <string>
namespace c10::cuda {

View File

@ -128,7 +128,7 @@ std::ostream& operator<<(std::ostream& stream, StreamIdType s) {
} else if (s.isExt()) {
stream << "EXT";
} else {
stream << "PRIORITY " << static_cast<int>(s.getStreamType());
stream << "PRIORITY " << int(s.getStreamType());
}
return stream;
}

View File

@ -1,6 +1,7 @@
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/driver_api.h>
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/util/Logging.h>
#include <cuda_runtime.h>

View File

@ -328,21 +328,5 @@ struct pair {
T2 second;
};
#define INSTANTIATE_FOR_ALL_TYPES(MACRO) \
MACRO(float); \
MACRO(half); \
MACRO(bfloat); \
MACRO(float2); \
MACRO(long); \
MACRO(char); \
MACRO(uchar); \
MACRO(short); \
MACRO(int);
#define INSTANTIATE_FOR_FLOAT_TYPES(MACRO) \
MACRO(float); \
MACRO(half); \
MACRO(bfloat);
} // namespace metal
} // namespace c10

Some files were not shown because too many files have changed in this diff Show More