Build Triton in Docker image (#95233)

We saw a bunch of timeout errors when trying to clone and build Triton today (c6d8d10b3e), so let's build Triton as part of the Docker image.

* The pinned commit file is moved to the Docker context at `.ci/docker/ci_commit_pins/triton.txt`, and `.github/ci_commit_pins/triton.txt` is now a soft link pointing to it
* New Docker images are built whenever the pinned commit is updated
* The build logic is in `.ci/docker/common/install_triton.sh`, which copies the `install_triton` step in the CI. The latter can be removed in a separate PR after this one

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95233
Approved by: https://github.com/weiwangmeta, https://github.com/malfet
This commit is contained in:
Huy Do
2023-02-28 22:01:37 +00:00
committed by PyTorch MergeBot
parent b55d0d2aef
commit ba43d908f9
9 changed files with 98 additions and 2 deletions

View File

@ -100,6 +100,7 @@ case "$image" in
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7)
CUDA_VERSION=11.7.0
@ -113,6 +114,7 @@ case "$image" in
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7)
CUDA_VERSION=11.8.0
@ -126,6 +128,7 @@ case "$image" in
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-py3-clang7-asan)
ANACONDA_PYTHON_VERSION=3.9
@ -134,6 +137,7 @@ case "$image" in
DB=yes
VISION=yes
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-py3-clang10-onnx)
ANACONDA_PYTHON_VERSION=3.8
@ -162,6 +166,7 @@ case "$image" in
VULKAN_SDK_VERSION=1.2.162.1
SWIFTSHADER=yes
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-bionic-py3.11-clang9)
ANACONDA_PYTHON_VERSION=3.11
@ -172,6 +177,7 @@ case "$image" in
VULKAN_SDK_VERSION=1.2.162.1
SWIFTSHADER=yes
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-bionic-py3.8-gcc9)
ANACONDA_PYTHON_VERSION=3.8
@ -180,6 +186,7 @@ case "$image" in
DB=yes
VISION=yes
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-rocm-n-1-py3)
ANACONDA_PYTHON_VERSION=3.8
@ -209,6 +216,7 @@ case "$image" in
VISION=yes
KATEX=yes
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12)
ANACONDA_PYTHON_VERSION=3.8
@ -218,6 +226,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12)
ANACONDA_PYTHON_VERSION=3.8
@ -227,6 +236,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12)
ANACONDA_PYTHON_VERSION=3.8
@ -236,6 +246,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
TRITON=yes
;;
pytorch-linux-focal-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
@ -328,6 +339,7 @@ docker build \
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
--build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \
--build-arg "TRITON=${TRITON}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \

View File

@ -0,0 +1 @@
b8b470bc597c1c5bd03682c09fe3e6b7c53787fd

View File

@ -13,7 +13,7 @@ as_jenkins() {
# NB: Pass on PATH and LD_LIBRARY_PATH to sudo invocation
# NB: This must be run from a directory that jenkins has access to,
# works around https://github.com/conda/conda-package-handling/pull/34
$SUDO -H -u jenkins env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" $*
$SUDO -E -H -u jenkins env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" $*
}
conda_install() {
@ -30,3 +30,7 @@ conda_run() {
# Run `pip install` (progress bar off) for the given requirement(s) inside the
# py_$ANACONDA_PYTHON_VERSION conda environment, going through as_jenkins.
# Globals:   ANACONDA_PYTHON_VERSION (read)
# Arguments: requirement specifiers / pip options, forwarded to pip
pip_install() {
  # The expansions are deliberately left unquoted inside the array literal so
  # the arguments are word-split exactly as in a plain command line.
  local -a pip_argv=(
    conda run -n py_$ANACONDA_PYTHON_VERSION
    pip install --progress-bar off $*
  )
  as_jenkins "${pip_argv[@]}"
}
# Print the contents of a commit pin file.
# Arguments: $1 - pin name; the file read is "<name>.txt", resolved relative to
#            the current working directory
# Outputs:   the file contents on stdout
get_pinned_commit() {
  local pin_file="${1}.txt"
  cat "${pin_file}"
}

View File

@ -0,0 +1,54 @@
#!/bin/bash
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
#######################################
# Print the installed version of one package from the
# py_$ANACONDA_PYTHON_VERSION conda environment.
# Globals:   ANACONDA_PYTHON_VERSION (read)
# Arguments: $1 - exact conda package name (e.g. cmake)
# Outputs:   the version column of the matching `conda list` row on stdout;
#            nothing when the package is not listed
#######################################
get_conda_version() {
  # Compare the name column exactly. `grep -w` treats "-" as a word boundary,
  # so it also matches rows such as "foo-cmake" or "numpy-base", and taking the
  # first hit could then report the version of a different package.
  as_jenkins conda list -n "py_${ANACONDA_PYTHON_VERSION}" \
    | awk -v pkg="$1" '$1 == pkg && !seen { print $2; seen = 1 }'
}
# Force-reinstall the given package spec(s) into the
# py_$ANACONDA_PYTHON_VERSION conda environment, quietly and without prompting.
# Globals:   ANACONDA_PYTHON_VERSION (read)
# Arguments: conda package specs (e.g. cmake=3.22.1), forwarded to conda
conda_reinstall() {
  # The expansions are deliberately left unquoted inside the array literal so
  # the arguments are word-split exactly as in a plain command line.
  local -a conda_argv=(
    conda install -q
    -n py_$ANACONDA_PYTHON_VERSION
    -y --force-reinstall
    $*
  )
  as_jenkins "${conda_argv[@]}"
}
# The logic here is copied from .ci/pytorch/common_utils.sh
TRITON_PINNED_COMMIT=$(get_pinned_commit triton)
# pip requirement that builds Triton from source at the pinned commit; shared
# by every toolchain branch below
TRITON_PIP_SPEC="git+https://github.com/openai/triton@${TRITON_PINNED_COMMIT}#subdirectory=python"

# Use apt-get for both steps: apt(8) targets interactive use and does not
# guarantee a stable command line interface for scripts
apt-get update
apt-get install -y gpg-agent

if [[ -n "${CONDA_CMAKE}" ]]; then
  # Keep the current cmake and numpy version here, so we can reinstall them later
  CMAKE_VERSION=$(get_conda_version cmake)
  NUMPY_VERSION=$(get_conda_version numpy)
fi

if [[ "${GCC_VERSION}" == "7" ]]; then
  # Triton needs at least gcc-9 to build
  apt-get install -y g++-9

  CXX=g++-9 pip_install "${TRITON_PIP_SPEC}"
elif [[ -n "${CLANG_VERSION}" ]]; then
  # Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
  add-apt-repository -y ppa:ubuntu-toolchain-r/test
  apt-get install -y g++-9

  CXX=g++-9 pip_install "${TRITON_PIP_SPEC}"
else
  pip_install "${TRITON_PIP_SPEC}"
fi

if [[ -n "${CONDA_CMAKE}" ]]; then
  # TODO: This is to make sure that the same cmake and numpy version from install conda
  # script is used. Without this step, the newer cmake version (3.25.2) downloaded by
  # triton build step via pip will fail to detect conda MKL. Once that issue is fixed,
  # this can be removed.
  #
  # The correct numpy version also needs to be set here because conda claims that it
  # causes inconsistent environment.  Without this, conda will attempt to install the
  # latest numpy version, which fails ASAN tests with the following import error: Numba
  # needs NumPy 1.20 or less.
  conda_reinstall cmake="${CMAKE_VERSION}"
  conda_reinstall numpy="${NUMPY_VERSION}"
fi

View File

@ -258,3 +258,8 @@ ghstack==0.7.1
#Description: ghstack tool
#Pinned versions: 0.7.1
#test that import:
jinja2==3.1.2
#Description: jinja2 template engine
#Pinned versions: 3.1.2
#test that import:

View File

@ -85,6 +85,15 @@ COPY ./common/install_cmake.sh install_cmake.sh
RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
RUN rm install_cmake.sh
ARG TRITON
# Install Triton. This needs to be done before sccache because the latter will
# try to reach out to S3, which docker build runners don't have access to
COPY ./common/install_triton.sh install_triton.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/triton.txt triton.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton.txt
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH

View File

@ -134,6 +134,15 @@ ENV OPENSSL_ROOT_DIR /opt/openssl
ENV OPENSSL_DIR /opt/openssl
RUN rm install_openssl.sh
ARG TRITON
# Install Triton. This needs to be done before sccache because the latter will
# try to reach out to S3, which docker build runners don't have access to
COPY ./common/install_triton.sh install_triton.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/triton.txt triton.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton.txt
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH

View File

@ -1 +0,0 @@
b8b470bc597c1c5bd03682c09fe3e6b7c53787fd

1
.github/ci_commit_pins/triton.txt vendored Symbolic link
View File

@ -0,0 +1 @@
../../.ci/docker/ci_commit_pins/triton.txt

View File

@ -9,11 +9,13 @@ on:
- .github/workflows/build-triton-wheel.yml
- .github/scripts/build_triton_wheel.py
- .github/ci_commit_pins/triton.txt
- .ci/docker/ci_commit_pins/triton.txt
pull_request:
paths:
- .github/workflows/build-triton-wheel.yml
- .github/scripts/build_triton_wheel.py
- .github/ci_commit_pins/triton.txt
- .ci/docker/ci_commit_pins/triton.txt
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}