From ba43d908f9a9950a89dfaaf88a0372489adb42fb Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 28 Feb 2023 22:01:37 +0000 Subject: [PATCH] Build Triton in Docker image (#95233) We saw a bunch of timeout errors when trying to clone and build Triton today (https://hud.pytorch.org/pytorch/pytorch/commit/c6d8d10b3e974019dae7ec91a85c6192c6d511fa), so let's build Triton as part of the Docker image. * The pinned commit file is moved to the Docker context at `.ci/docker/ci_commit_pins/triton.txt`, and `.github/ci_commit_pins/triton.txt` is now a soft link pointing to it * New Docker images are built whenever the pinned commit is updated * The build logic is in `.ci/docker/common/install_triton.sh`, which copies the `install_triton` step in the CI. The latter can be removed in a separate PR after this one Pull Request resolved: https://github.com/pytorch/pytorch/pull/95233 Approved by: https://github.com/weiwangmeta, https://github.com/malfet --- .ci/docker/build.sh | 12 ++++++ .ci/docker/ci_commit_pins/triton.txt | 1 + .ci/docker/common/common_utils.sh | 6 ++- .ci/docker/common/install_triton.sh | 54 ++++++++++++++++++++++++ .ci/docker/requirements-ci.txt | 5 +++ .ci/docker/ubuntu-cuda/Dockerfile | 9 ++++ .ci/docker/ubuntu/Dockerfile | 9 ++++ .github/ci_commit_pins/triton.txt | 2 +- .github/workflows/build-triton-wheel.yml | 2 + 9 files changed, 98 insertions(+), 2 deletions(-) create mode 100644 .ci/docker/ci_commit_pins/triton.txt create mode 100755 .ci/docker/common/install_triton.sh mode change 100644 => 120000 .github/ci_commit_pins/triton.txt diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index ffddc546ebf3..ca6847567c36 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -100,6 +100,7 @@ case "$image" in UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} CONDA_CMAKE=yes + TRITON=yes ;; pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7) CUDA_VERSION=11.7.0 @@ -113,6 +114,7 @@ case "$image" in UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} CONDA_CMAKE=yes + 
TRITON=yes ;; pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7) CUDA_VERSION=11.8.0 @@ -126,6 +128,7 @@ case "$image" in UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} CONDA_CMAKE=yes + TRITON=yes ;; pytorch-linux-focal-py3-clang7-asan) ANACONDA_PYTHON_VERSION=3.9 @@ -134,6 +137,7 @@ case "$image" in DB=yes VISION=yes CONDA_CMAKE=yes + TRITON=yes ;; pytorch-linux-focal-py3-clang10-onnx) ANACONDA_PYTHON_VERSION=3.8 @@ -162,6 +166,7 @@ case "$image" in VULKAN_SDK_VERSION=1.2.162.1 SWIFTSHADER=yes CONDA_CMAKE=yes + TRITON=yes ;; pytorch-linux-bionic-py3.11-clang9) ANACONDA_PYTHON_VERSION=3.11 @@ -172,6 +177,7 @@ case "$image" in VULKAN_SDK_VERSION=1.2.162.1 SWIFTSHADER=yes CONDA_CMAKE=yes + TRITON=yes ;; pytorch-linux-bionic-py3.8-gcc9) ANACONDA_PYTHON_VERSION=3.8 @@ -180,6 +186,7 @@ case "$image" in DB=yes VISION=yes CONDA_CMAKE=yes + TRITON=yes ;; pytorch-linux-focal-rocm-n-1-py3) ANACONDA_PYTHON_VERSION=3.8 @@ -209,6 +216,7 @@ case "$image" in VISION=yes KATEX=yes CONDA_CMAKE=yes + TRITON=yes ;; pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12) ANACONDA_PYTHON_VERSION=3.8 @@ -218,6 +226,7 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes + TRITON=yes ;; pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12) ANACONDA_PYTHON_VERSION=3.8 @@ -227,6 +236,7 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes + TRITON=yes ;; pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12) ANACONDA_PYTHON_VERSION=3.8 @@ -236,6 +246,7 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes + TRITON=yes ;; pytorch-linux-focal-linter) # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627. 
@@ -328,6 +339,7 @@ docker build \ --build-arg "UCX_COMMIT=${UCX_COMMIT}" \ --build-arg "UCC_COMMIT=${UCC_COMMIT}" \ --build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \ + --build-arg "TRITON=${TRITON}" \ -f $(dirname ${DOCKERFILE})/Dockerfile \ -t "$tmp_tag" \ "$@" \ diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt new file mode 100644 index 000000000000..d3ca0816018a --- /dev/null +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -0,0 +1 @@ +b8b470bc597c1c5bd03682c09fe3e6b7c53787fd diff --git a/.ci/docker/common/common_utils.sh b/.ci/docker/common/common_utils.sh index 74c398397798..27c1b815a0ea 100644 --- a/.ci/docker/common/common_utils.sh +++ b/.ci/docker/common/common_utils.sh @@ -13,7 +13,7 @@ as_jenkins() { # NB: Pass on PATH and LD_LIBRARY_PATH to sudo invocation # NB: This must be run from a directory that jenkins has access to, # works around https://github.com/conda/conda-package-handling/pull/34 - $SUDO -H -u jenkins env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" $* + $SUDO -E -H -u jenkins env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" $* } conda_install() { @@ -30,3 +30,7 @@ conda_run() { pip_install() { as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off $* } + +get_pinned_commit() { + cat "${1}".txt +} diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh new file mode 100755 index 000000000000..4926b817bd2f --- /dev/null +++ b/.ci/docker/common/install_triton.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +set -ex + +source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" + +get_conda_version() { + as_jenkins conda list -n py_$ANACONDA_PYTHON_VERSION | grep -w $* | head -n 1 | awk '{print $2}' +} + +conda_reinstall() { + as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $* +} + +# The logic here is copied from 
.ci/pytorch/common_utils.sh +TRITON_PINNED_COMMIT=$(get_pinned_commit triton) + +apt update +apt-get install -y gpg-agent + +if [ -n "${CONDA_CMAKE}" ]; then + # Keep the current cmake and numpy version here, so we can reinstall them later + CMAKE_VERSION=$(get_conda_version cmake) + NUMPY_VERSION=$(get_conda_version numpy) +fi + +if [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" == "7" ]]; then + # Triton needs at least gcc-9 to build + apt-get install -y g++-9 + + CXX=g++-9 pip_install "git+https://github.com/openai/triton@${TRITON_PINNED_COMMIT}#subdirectory=python" +elif [ -n "${CLANG_VERSION}" ]; then + # Triton needs the C++ `filesystem` header, which surprisingly is not available with the clang-9 toolchain + add-apt-repository -y ppa:ubuntu-toolchain-r/test + apt-get install -y g++-9 + + CXX=g++-9 pip_install "git+https://github.com/openai/triton@${TRITON_PINNED_COMMIT}#subdirectory=python" +else + pip_install "git+https://github.com/openai/triton@${TRITON_PINNED_COMMIT}#subdirectory=python" +fi + +if [ -n "${CONDA_CMAKE}" ]; then + # TODO: This is to make sure that the same cmake and numpy version from install conda + # script is used. Without this step, the newer cmake version (3.25.2) downloaded by + # triton build step via pip will fail to detect conda MKL. Once that issue is fixed, + # this can be removed. + # + # The correct numpy version also needs to be set here because conda claims that it + # causes inconsistent environment. Without this, conda will attempt to install the + # latest numpy version, which fails ASAN tests with the following import error: Numba + # needs NumPy 1.20 or less. 
+ conda_reinstall cmake="${CMAKE_VERSION}" + conda_reinstall numpy="${NUMPY_VERSION}" +fi diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index f3b5a0a85126..2196c92fe99a 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -258,3 +258,8 @@ ghstack==0.7.1 #Description: ghstack tool #Pinned versions: 0.7.1 #test that import: + +jinja2==3.1.2 +#Description: jinja2 template engine +#Pinned versions: 3.1.2 +#test that import: diff --git a/.ci/docker/ubuntu-cuda/Dockerfile b/.ci/docker/ubuntu-cuda/Dockerfile index 7784427eaa75..0e294838f90f 100644 --- a/.ci/docker/ubuntu-cuda/Dockerfile +++ b/.ci/docker/ubuntu-cuda/Dockerfile @@ -85,6 +85,15 @@ COPY ./common/install_cmake.sh install_cmake.sh RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi RUN rm install_cmake.sh +ARG TRITON +# Install triton, this needs to be done before sccache because the latter will +# try to reach out to S3, which docker build runners don't have access +COPY ./common/install_triton.sh install_triton.sh +COPY ./common/common_utils.sh common_utils.sh +COPY ci_commit_pins/triton.txt triton.txt +RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi +RUN rm install_triton.sh common_utils.sh triton.txt + # Install ccache/sccache (do this last, so we get priority in PATH) COPY ./common/install_cache.sh install_cache.sh ENV PATH /opt/cache/bin:$PATH diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 60a17c1d3e36..fd0e3a4fdfba 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -134,6 +134,15 @@ ENV OPENSSL_ROOT_DIR /opt/openssl ENV OPENSSL_DIR /opt/openssl RUN rm install_openssl.sh +ARG TRITON +# Install triton, this needs to be done before sccache because the latter will +# try to reach out to S3, which docker build runners don't have access +COPY ./common/install_triton.sh install_triton.sh +COPY ./common/common_utils.sh common_utils.sh +COPY 
ci_commit_pins/triton.txt triton.txt +RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi +RUN rm install_triton.sh common_utils.sh triton.txt + # Install ccache/sccache (do this last, so we get priority in PATH) COPY ./common/install_cache.sh install_cache.sh ENV PATH /opt/cache/bin:$PATH diff --git a/.github/ci_commit_pins/triton.txt b/.github/ci_commit_pins/triton.txt deleted file mode 100644 index d3ca0816018a..000000000000 --- a/.github/ci_commit_pins/triton.txt +++ /dev/null @@ -1 +0,0 @@ -b8b470bc597c1c5bd03682c09fe3e6b7c53787fd diff --git a/.github/ci_commit_pins/triton.txt b/.github/ci_commit_pins/triton.txt new file mode 120000 index 000000000000..7b62e01173b3 --- /dev/null +++ b/.github/ci_commit_pins/triton.txt @@ -0,0 +1 @@ +../../.ci/docker/ci_commit_pins/triton.txt \ No newline at end of file diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index f59b5a68ba9a..29bb67a04f2f 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -9,11 +9,13 @@ on: - .github/workflows/build-triton-wheel.yml - .github/scripts/build_triton_wheel.py - .github/ci_commit_pins/triton.txt + - .ci/docker/ci_commit_pins/triton.txt pull_request: paths: - .github/workflows/build-triton-wheel.yml - .github/scripts/build_triton_wheel.py - .github/ci_commit_pins/triton.txt + - .ci/docker/ci_commit_pins/triton.txt concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}