mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[CI] Add initial ci test workflow for XPU based on IDC runners (#116554)
Add initial CI test for XPU based on IDC self-hosted runners with label `linux.idc.xpu`, which will be triggered by label `ciflow/xpu` for current stage. Works for RFC https://github.com/pytorch/pytorch/issues/114850 Pull Request resolved: https://github.com/pytorch/pytorch/pull/116554 Approved by: https://github.com/EikanWang, https://github.com/atalman
This commit is contained in:
committed by
PyTorch MergeBot
parent
6784030df4
commit
b6962208b8
@ -128,6 +128,8 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then
|
||||
# mainly used so that we're not spending extra cycles testing cpu
|
||||
# devices on expensive gpu machines
|
||||
export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda"
|
||||
elif [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
|
||||
export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu"
|
||||
fi
|
||||
|
||||
if [[ "$TEST_CONFIG" == *crossref* ]]; then
|
||||
@ -140,6 +142,15 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
|
||||
rocminfo | grep -E 'Name:.*\sgfx|Marketing'
|
||||
fi
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
|
||||
# Source Intel oneAPI envrioment script to enable xpu runtime related libraries
|
||||
# refer to https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/2024-0/use-the-setvars-and-oneapi-vars-scripts-with-linux.html
|
||||
# shellcheck disable=SC1091
|
||||
source /opt/intel/oneapi/compiler/latest/env/vars.sh
|
||||
# Check XPU status before testing
|
||||
xpu-smi discovery
|
||||
fi
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
|
||||
# JIT C++ extensions require ninja.
|
||||
pip_install --user "ninja==1.10.2"
|
||||
@ -678,6 +689,20 @@ test_libtorch_api() {
|
||||
fi
|
||||
}
|
||||
|
||||
test_xpu_bin(){
|
||||
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
||||
mkdir -p "$TEST_REPORTS_DIR"
|
||||
|
||||
for xpu_case in "${BUILD_BIN_DIR}"/*{xpu,sycl}*
|
||||
do
|
||||
if [[ "$xpu_case" != *"*"* ]]; then
|
||||
case_name=$(basename "$xpu_case")
|
||||
echo "Testing ${case_name} ..."
|
||||
"$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
test_aot_compilation() {
|
||||
echo "Testing Ahead of Time compilation"
|
||||
ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
|
||||
@ -1115,6 +1140,9 @@ elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
|
||||
test_python_shard 1
|
||||
test_aten
|
||||
test_libtorch 1
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
|
||||
test_xpu_bin
|
||||
fi
|
||||
elif [[ "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
|
||||
install_torchvision
|
||||
test_python_shard 2
|
||||
@ -1139,10 +1167,11 @@ elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
|
||||
install_torchvision
|
||||
test_python
|
||||
test_aten
|
||||
elif [[ "${BUILD_ENVIRONMENT}" == *xpu* && -n "$TESTS_TO_INCLUDE" ]]; then
|
||||
elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
|
||||
install_torchvision
|
||||
test_python
|
||||
test_aten
|
||||
test_xpu_bin
|
||||
else
|
||||
install_torchvision
|
||||
install_monkeytype
|
||||
|
67
.github/actions/setup-xpu/action.yml
vendored
Normal file
67
.github/actions/setup-xpu/action.yml
vendored
Normal file
@ -0,0 +1,67 @@
|
||||
name: Setup XPU host
|
||||
|
||||
description: Set up XPU host for CI
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- name: Clean all stopped docker containers
|
||||
if: always()
|
||||
shell: bash
|
||||
run: |
|
||||
# Prune all stopped containers.
|
||||
# If other runner is pruning on this node, will skip.
|
||||
nprune=$(ps -ef | grep -c "docker container prune")
|
||||
if [[ $nprune -eq 1 ]]; then
|
||||
docker container prune -f
|
||||
fi
|
||||
|
||||
- name: Runner health check system info
|
||||
if: always()
|
||||
shell: bash
|
||||
run: |
|
||||
cat /etc/os-release || true
|
||||
cat /etc/apt/sources.list.d/oneAPI.list || true
|
||||
cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true
|
||||
whoami
|
||||
|
||||
- name: Runner health check xpu-smi
|
||||
if: always()
|
||||
shell: bash
|
||||
run: |
|
||||
xpu-smi discovery
|
||||
|
||||
- name: Runner health check GPU count
|
||||
if: always()
|
||||
shell: bash
|
||||
run: |
|
||||
ngpu=$(xpu-smi discovery | grep -c -E 'Device Name')
|
||||
msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
|
||||
if [[ $ngpu -eq 0 ]]; then
|
||||
echo "Error: Failed to detect any GPUs on the runner"
|
||||
echo "$msg"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Runner diskspace health check
|
||||
uses: ./.github/actions/diskspace-cleanup
|
||||
if: always()
|
||||
|
||||
- name: Runner health check disconnect on failure
|
||||
if: ${{ failure() }}
|
||||
shell: bash
|
||||
run: |
|
||||
killall runsvc.sh
|
||||
|
||||
- name: Preserve github env variables for use in docker
|
||||
shell: bash
|
||||
run: |
|
||||
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
|
||||
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
|
||||
|
||||
- name: XPU set GPU_FLAG
|
||||
shell: bash
|
||||
run: |
|
||||
# Add render group for container creation.
|
||||
render_gid=`cat /etc/group | grep render | cut -d: -f3`
|
||||
echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"
|
20
.github/actions/teardown-xpu/action.yml
vendored
Normal file
20
.github/actions/teardown-xpu/action.yml
vendored
Normal file
@ -0,0 +1,20 @@
|
||||
name: Teardown XPU host
|
||||
|
||||
description: Tear down XPU host for CI
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- name: Teardown XPU
|
||||
if: always()
|
||||
shell: bash
|
||||
run: |
|
||||
# Prune all stopped containers.
|
||||
# If other runner is pruning on this node, will skip.
|
||||
nprune=$(ps -ef | grep -c "docker container prune")
|
||||
if [[ $nprune -eq 1 ]]; then
|
||||
docker container prune -f
|
||||
fi
|
||||
- name: Runner diskspace health check
|
||||
uses: ./.github/actions/diskspace-cleanup
|
||||
if: always()
|
269
.github/workflows/_xpu-test.yml
vendored
Normal file
269
.github/workflows/_xpu-test.yml
vendored
Normal file
@ -0,0 +1,269 @@
|
||||
# TODO: this looks sort of similar to _linux-test, but there are like a dozen
|
||||
# places where you would have to insert an if statement. Probably it's better to
|
||||
# just use a different workflow altogether
|
||||
|
||||
name: xpu-test
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
build-environment:
|
||||
required: true
|
||||
type: string
|
||||
description: Top-level label for what's being built/tested.
|
||||
test-matrix:
|
||||
required: true
|
||||
type: string
|
||||
description: JSON description of what test configs to run.
|
||||
docker-image:
|
||||
required: true
|
||||
type: string
|
||||
description: Docker image to run in.
|
||||
sync-tag:
|
||||
required: false
|
||||
type: string
|
||||
default: ""
|
||||
description: |
|
||||
If this is set, our linter will use this to make sure that every other
|
||||
job with the same `sync-tag` is identical.
|
||||
timeout-minutes:
|
||||
required: false
|
||||
type: number
|
||||
default: 300
|
||||
description: |
|
||||
Set the maximum (in minutes) how long the workflow should take to finish
|
||||
tests-to-include:
|
||||
required: false
|
||||
type: string
|
||||
default: ""
|
||||
description: |
|
||||
List of tests to include (empty string implies default list)
|
||||
|
||||
env:
|
||||
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
|
||||
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
test:
|
||||
# Don't run on forked repos or empty test matrix
|
||||
if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
|
||||
strategy:
|
||||
matrix: ${{ fromJSON(inputs.test-matrix) }}
|
||||
fail-fast: false
|
||||
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
|
||||
runs-on: ${{ matrix.runner }}
|
||||
steps:
|
||||
# [see note: pytorch repo ref]
|
||||
- name: Checkout PyTorch
|
||||
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
|
||||
|
||||
- name: Setup XPU
|
||||
uses: ./.github/actions/setup-xpu
|
||||
|
||||
- name: configure aws credentials
|
||||
id: aws_creds
|
||||
uses: aws-actions/configure-aws-credentials@v1.7.0
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_pytorch_artifacts
|
||||
aws-region: us-east-1
|
||||
|
||||
- name: Login to Amazon ECR
|
||||
id: login-ecr
|
||||
uses: aws-actions/amazon-ecr-login@v2
|
||||
|
||||
- name: Calculate docker image
|
||||
id: calculate-docker-image
|
||||
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
|
||||
with:
|
||||
docker-image-name: ${{ inputs.docker-image }}
|
||||
|
||||
- name: Pull docker image
|
||||
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
|
||||
with:
|
||||
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
|
||||
- name: Start monitoring script
|
||||
id: monitor-script
|
||||
shell: bash
|
||||
continue-on-error: true
|
||||
run: |
|
||||
python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
|
||||
python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
|
||||
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Download build artifacts
|
||||
uses: ./.github/actions/download-build-artifacts
|
||||
with:
|
||||
name: ${{ inputs.build-environment }}
|
||||
|
||||
- name: Parse ref
|
||||
id: parse-ref
|
||||
run: .github/scripts/parse_ref.py
|
||||
|
||||
- name: Get workflow job id
|
||||
id: get-job-id
|
||||
uses: ./.github/actions/get-workflow-job-id
|
||||
if: always()
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Check for keep-going label and re-enabled test issues
|
||||
# This uses the filter-test-configs action because it conviniently
|
||||
# checks for labels and re-enabled test issues. It does not actually do
|
||||
# any filtering. All filtering is done in the build step.
|
||||
id: keep-going
|
||||
uses: ./.github/actions/filter-test-configs
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
test-matrix: ${{ inputs.test-matrix }}
|
||||
job-name: ${{ steps.get-job-id.outputs.job-name }}
|
||||
|
||||
- name: Set Test step time
|
||||
id: test-timeout
|
||||
shell: bash
|
||||
env:
|
||||
JOB_TIMEOUT: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
|
||||
run: |
|
||||
echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Test
|
||||
id: test
|
||||
env:
|
||||
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
GITHUB_REPOSITORY: ${{ github.repository }}
|
||||
GITHUB_WORKFLOW: ${{ github.workflow }}
|
||||
GITHUB_JOB: ${{ github.job }}
|
||||
GITHUB_RUN_ID: ${{ github.run_id }}
|
||||
GITHUB_RUN_NUMBER: ${{ github.run_number }}
|
||||
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
|
||||
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
|
||||
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
|
||||
BRANCH: ${{ steps.parse-ref.outputs.branch }}
|
||||
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
PYTORCH_RETRY_TEST_CASES: 1
|
||||
PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
|
||||
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
|
||||
TEST_CONFIG: ${{ matrix.config }}
|
||||
SHARD_NUMBER: ${{ matrix.shard }}
|
||||
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
|
||||
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
|
||||
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
|
||||
DOCKER_IMAGE: ${{ inputs.docker-image }}
|
||||
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
|
||||
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
|
||||
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
|
||||
TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
|
||||
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
|
||||
run: |
|
||||
set -x
|
||||
|
||||
TEST_COMMAND=.ci/pytorch/test.sh
|
||||
|
||||
# detached container should get cleaned up by teardown_ec2_linux
|
||||
# Used for GPU_FLAG since that doesn't play nice
|
||||
# shellcheck disable=SC2086,SC2090
|
||||
container_name=$(docker run \
|
||||
${GPU_FLAG:-} \
|
||||
-e BUILD_ENVIRONMENT \
|
||||
-e PR_NUMBER \
|
||||
-e GITHUB_ACTIONS \
|
||||
-e GITHUB_REPOSITORY \
|
||||
-e GITHUB_WORKFLOW \
|
||||
-e GITHUB_JOB \
|
||||
-e GITHUB_RUN_ID \
|
||||
-e GITHUB_RUN_NUMBER \
|
||||
-e GITHUB_RUN_ATTEMPT \
|
||||
-e JOB_ID \
|
||||
-e BRANCH \
|
||||
-e SHA1 \
|
||||
-e AWS_DEFAULT_REGION \
|
||||
-e IN_WHEEL_TEST \
|
||||
-e SHARD_NUMBER \
|
||||
-e TEST_CONFIG \
|
||||
-e NUM_TEST_SHARDS \
|
||||
-e REENABLED_ISSUES \
|
||||
-e PYTORCH_RETRY_TEST_CASES \
|
||||
-e PYTORCH_OVERRIDE_FLAKY_SIGNAL \
|
||||
-e CONTINUE_THROUGH_ERROR \
|
||||
-e MAX_JOBS="$(nproc --ignore=2)" \
|
||||
-e SCCACHE_BUCKET \
|
||||
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
|
||||
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
|
||||
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
|
||||
-e TESTS_TO_INCLUDE \
|
||||
-e ZE_AFFINITY_MASK \
|
||||
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
|
||||
--ulimit stack=10485760:83886080 \
|
||||
--ulimit core=0 \
|
||||
--security-opt seccomp=unconfined \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--shm-size="8g" \
|
||||
--tty \
|
||||
--detach \
|
||||
--name="${container_name}" \
|
||||
--user jenkins \
|
||||
--privileged \
|
||||
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
|
||||
-w /var/lib/jenkins/workspace \
|
||||
"${DOCKER_IMAGE}"
|
||||
)
|
||||
# save container name for later step
|
||||
echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
|
||||
# jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home
|
||||
docker exec -t "${container_name}" sh -c "cd .. && cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}"
|
||||
|
||||
- name: Save test results
|
||||
if: always()
|
||||
run: |
|
||||
# copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
|
||||
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
|
||||
|
||||
- name: Print remaining test logs
|
||||
shell: bash
|
||||
if: always() && steps.test.conclusion
|
||||
run: |
|
||||
cat test/**/*_toprint.log || true
|
||||
|
||||
- name: Stop monitoring script
|
||||
if: always() && steps.monitor-script.outputs.monitor-script-pid
|
||||
shell: bash
|
||||
continue-on-error: true
|
||||
env:
|
||||
MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
|
||||
run: |
|
||||
kill "$MONITOR_SCRIPT_PID"
|
||||
|
||||
- name: Upload test artifacts
|
||||
uses: ./.github/actions/upload-test-artifacts
|
||||
if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
|
||||
with:
|
||||
use-gha: true
|
||||
file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
|
||||
|
||||
- name: Collect backtraces from coredumps (if any)
|
||||
if: always()
|
||||
run: |
|
||||
# shellcheck disable=SC2156
|
||||
find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
|
||||
|
||||
- name: Stop container before exit
|
||||
if: always()
|
||||
run: |
|
||||
# Workaround for multiple runners on same IDC node
|
||||
docker stop "${{ env.CONTAINER_NAME }}"
|
||||
|
||||
- name: Store Core dumps on GitHub
|
||||
uses: actions/upload-artifact@v3
|
||||
if: failure()
|
||||
with:
|
||||
name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
|
||||
retention-days: 14
|
||||
if-no-files-found: ignore
|
||||
path: ./**/core.[1-9]*
|
||||
|
||||
- name: Teardown XPU
|
||||
uses: ./.github/actions/teardown-xpu
|
14
.github/workflows/xpu.yml
vendored
14
.github/workflows/xpu.yml
vendored
@ -20,5 +20,17 @@ jobs:
|
||||
runner: linux.2xlarge
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 1, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 2, num_shards: 4, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 3, num_shards: 4, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 4, num_shards: 4, runner: "linux.idc.xpu" },
|
||||
]}
|
||||
|
||||
linux-jammy-xpu-py3_8-test:
|
||||
name: linux-jammy-xpu-py3.8
|
||||
uses: ./.github/workflows/_xpu-test.yml
|
||||
needs: linux-jammy-xpu-py3_8-build
|
||||
with:
|
||||
build-environment: linux-jammy-xpu-py3.8
|
||||
docker-image: ${{ needs.linux-jammy-xpu-py3_8-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-xpu-py3_8-build.outputs.test-matrix }}
|
||||
|
Reference in New Issue
Block a user