#!/bin/bash

# Required environment variable: $BUILD_ENVIRONMENT
# (This is set by default in the Docker images we build, so you don't
# need to set it yourself.)

set -ex -o pipefail

# Suppress ANSI color escape sequences
export TERM=vt100

# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"

# Do not change workspace permissions for ROCm and s390x CI jobs,
# as doing so can leave the workspace with bad permissions for cancelled jobs
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then
  # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
  WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
  cleanup_workspace() {
    echo "sudo may print the following warning message that can be ignored. The chown command will still run."
    echo " sudo: setrlimit(RLIMIT_STACK): Operation not permitted"
    echo "For more details refer to https://github.com/sudo-project/sudo/issues/42"
    sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace
  }
  # Disable shellcheck SC2064 as we want to parse the original owner immediately.
  # shellcheck disable=SC2064
  trap_add cleanup_workspace EXIT
  sudo chown -R jenkins /var/lib/jenkins/workspace
  git config --global --add safe.directory /var/lib/jenkins/workspace
fi

# Patch numba to avoid a CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
  NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
  if [ -n "$NUMBA_CUDA_DIR" ]; then
    NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
    pushd "$NUMBA_CUDA_DIR"
    patch -p4 <"$NUMBA_PATCH"
    popd
  fi
fi

echo "Environment variables:"
env

TORCH_INSTALL_DIR=$(python -c "import site; print(site.getsitepackages()[0])")/torch
TORCH_BIN_DIR="$TORCH_INSTALL_DIR"/bin
TORCH_LIB_DIR="$TORCH_INSTALL_DIR"/lib
TORCH_TEST_DIR="$TORCH_INSTALL_DIR"/test

BUILD_DIR="build"
BUILD_RENAMED_DIR="build_renamed"
BUILD_BIN_DIR="$BUILD_DIR"/bin

# Set default values for these variables in case they are not set
SHARD_NUMBER="${SHARD_NUMBER:=1}"
NUM_TEST_SHARDS="${NUM_TEST_SHARDS:=1}"

# Enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1

export VALGRIND=ON
# export TORCH_INDUCTOR_INSTALL_GXX=ON
if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  # clang9 appears to miscompile code involving std::optional<c10::SymInt>,
  # such that valgrind complains along these lines:
  #
  #   Conditional jump or move depends on uninitialised value(s)
  #     at 0x40303A: ~optional_base (Optional.h:281)
  #     by 0x40303A: call (Dispatcher.h:448)
  #     by 0x40303A: call(at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::optional<c10::SymInt>) (basic.cpp:10)
  #     by 0x403700: main (basic.cpp:16)
  #   Uninitialised value was created by a stack allocation
  #     at 0x402AAA: call(at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::optional<c10::SymInt>) (basic.cpp:6)
  #
  # The problem does not appear with gcc or newer versions of clang (we tested
  # clang14). So we suppress valgrind testing for clang9 specifically.
  # You may need to suppress it for other versions of clang if they still have
  # the bug.
  #
  # A minimal repro for the valgrind error is below:
  #
  #   #include <ATen/ATen.h>
  #   #include <ATen/core/dispatch/Dispatcher.h>
  #
  #   using namespace at;
  #
  #   Tensor call(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, std::optional<c10::SymInt> storage_offset) {
  #     auto op = c10::Dispatcher::singleton()
  #         .findSchemaOrThrow(at::_ops::as_strided::name, at::_ops::as_strided::overload_name)
  #         .typed<at::_ops::as_strided::schema>();
  #     return op.call(self, size, stride, storage_offset);
  #   }
  #
  #   int main(int argv) {
  #     Tensor b = empty({3, 4});
  #     auto z = call(b, b.sym_sizes(), b.sym_strides(), std::nullopt);
  #   }
  export VALGRIND=OFF
fi

detect_cuda_arch

if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then
  # There are additional warnings on s390x, maybe due to newer gcc.
  # Skip this check for now
  export VALGRIND=OFF
fi

if [[ "${PYTORCH_TEST_RERUN_DISABLED_TESTS}" == "1" ]] || [[ "${CONTINUE_THROUGH_ERROR}" == "1" ]]; then
  # When rerunning disabled tests, do not generate core dumps as they could consume
  # the runner disk space when crashed tests are run multiple times. Running out
  # of space is a nasty issue because there is no space left to even download the
  # GitHub Action needed to clean up the disk.
  #
  # We also want to turn off core dumps when CONTINUE_THROUGH_ERROR is set, as there
  # is a small risk of multiple core files being generated. Arguably, they are not
  # that useful in this case anyway, and the test will still continue.
  ulimit -c 0

  # Note that by piping the core dump to a script set in /proc/sys/kernel/core_pattern
  # as documented in https://man7.org/linux/man-pages/man5/core.5.html, we could
  # dynamically stop generating more core files when the disk space drops below a
  # certain threshold. However, this is not supported inside a Docker container at the moment.
fi
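
# Illustrative sketch only (not enabled here): a core_pattern pipe handler along
# these lines could drop cores once free disk space falls below a threshold. The
# handler path and the ~10GB cutoff are hypothetical, and since the approach is
# not usable inside our Docker containers (see the note above), it stays commented out.
#
#   # Register (as root, outside Docker):
#   #   echo '|/usr/local/bin/core-handler.sh %p' > /proc/sys/kernel/core_pattern
#   # /usr/local/bin/core-handler.sh:
#   #!/bin/bash
#   avail_kb=$(df --output=avail /var/tmp | tail -1)
#   if [ "$avail_kb" -gt 10485760 ]; then
#     # Enough space: persist the core streamed on stdin
#     cat > "/var/tmp/core.$1"
#   else
#     # Low on disk: discard the core
#     cat > /dev/null
#   fi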

# Get fully qualified path using realpath
if [[ "$BUILD_ENVIRONMENT" != *bazel* ]]; then
  CUSTOM_TEST_ARTIFACT_BUILD_DIR=$(realpath "${CUSTOM_TEST_ARTIFACT_BUILD_DIR:-"build/custom_test_artifacts"}")
fi

# Reduce set of tests to include when running run_test.py
if [[ -n $TESTS_TO_INCLUDE ]]; then
  echo "Setting INCLUDE_CLAUSE"
  INCLUDE_CLAUSE="--include $TESTS_TO_INCLUDE"
fi
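
# Illustrative example (hypothetical value): with TESTS_TO_INCLUDE="test_nn test_torch",
# INCLUDE_CLAUSE becomes "--include test_nn test_torch", which limits the run_test.py
# invocations below to just those test files.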

echo "Environment variables"
env

echo "Testing pytorch"

export LANG=C.UTF-8

PR_NUMBER=${PR_NUMBER:-${CIRCLE_PR_NUMBER:-}}

if [[ "$TEST_CONFIG" == 'default' ]]; then
  export CUDA_VISIBLE_DEVICES=0
  export HIP_VISIBLE_DEVICES=0
fi

if [[ "$TEST_CONFIG" == 'distributed' ]] && [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
  export HIP_VISIBLE_DEVICES=0,1,2,3
fi

if [[ "$TEST_CONFIG" == 'slow' ]]; then
  export PYTORCH_TEST_WITH_SLOW=1
  export PYTORCH_TEST_SKIP_FAST=1
fi

if [[ "$BUILD_ENVIRONMENT" == *slow-gradcheck* ]]; then
  export PYTORCH_TEST_WITH_SLOW_GRADCHECK=1
  # TODO: slow gradcheck tests run out of memory a lot recently, so setting this
  # to run them sequentially with only one process to mitigate the issue
  export PYTORCH_TEST_CUDA_MEM_LEAK_CHECK=1
fi

if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then
  # Used so that only cuda/rocm specific versions of tests are generated,
  # mainly so that we're not spending extra cycles testing cpu
  # devices on expensive gpu machines
  export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda"
elif [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu"
  # setting PYTHON_TEST_EXTRA_OPTION
  export PYTHON_TEST_EXTRA_OPTION="--xpu"
fi

if [[ "$TEST_CONFIG" == *crossref* ]]; then
  export PYTORCH_TEST_WITH_CROSSREF=1
fi

if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
  # regression in ROCm 6.0 on MI50 CI runners due to hipblaslt; remove in 6.1
  export VALGRIND=OFF
  # Print GPU info
  rocminfo
  rocminfo | grep -E 'Name:.*\sgfx|Marketing'

  # for benchmarks/dynamo/check_accuracy.py, we need to put results in a rocm-specific directory to avoid clashes with cuda
  MAYBE_ROCM="rocm/"
fi

if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  # Source the Intel oneAPI environment script to enable xpu runtime related libraries;
  # refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/compiler/latest/env/vars.sh
  if [ -f /opt/intel/oneapi/umf/latest/env/vars.sh ]; then
    # shellcheck disable=SC1091
    source /opt/intel/oneapi/umf/latest/env/vars.sh
  fi
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/ccl/latest/env/vars.sh
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/mpi/latest/env/vars.sh
  # Check XPU status before testing
  timeout 30 xpu-smi discovery || true
fi

if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
  # JIT C++ extensions require ninja.
  pip_install "ninja==1.10.2"
  # ninja is installed in $HOME/.local/bin, e.g., /var/lib/jenkins/.local/bin for CI user jenkins,
  # but this script should be runnable by any user, including root
  export PATH="$HOME/.local/bin:$PATH"
fi

if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
  # TODO: revisit this once the CI is stabilized on aarch64 linux
  export VALGRIND=OFF
fi

# DANGER WILL ROBINSON. The LD_PRELOAD here could cause you problems
# if you're not careful. Check this if you made some changes and the
# ASAN test is not working.
if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
  export ASAN_OPTIONS=detect_leaks=0:symbolize=1:detect_stack_use_after_return=true:strict_init_order=true:detect_odr_violation=1:detect_container_overflow=0:check_initialization_order=true:debug=true
  if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
    export ASAN_OPTIONS="${ASAN_OPTIONS}:protect_shadow_gap=0"
  fi
  export UBSAN_OPTIONS=print_stacktrace=1:suppressions=$PWD/ubsan.supp
  export PYTORCH_TEST_WITH_ASAN=1
  export PYTORCH_TEST_WITH_UBSAN=1
  # TODO: Figure out how to avoid hard-coding these paths
  export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-18/bin/llvm-symbolizer
  export TORCH_USE_RTLD_GLOBAL=1
  # NB: We load libtorch.so with RTLD_GLOBAL for UBSAN, unlike our
  # default behavior.
  #
  # The reason for this is that without RTLD_GLOBAL, if we load multiple
  # libraries that depend on libtorch (as is the case with C++ extensions), we
  # will get multiple copies of libtorch in our address space. When UBSAN is
  # turned on, it will do a bunch of virtual pointer consistency checks which
  # won't work correctly. When this happens, you get a violation like:
  #
  #   member call on address XXXXXX which does not point to an object of
  #   type 'std::_Sp_counted_base<__gnu_cxx::_Lock_policy::_S_atomic>'
  #   XXXXXX note: object is of type
  #   'std::_Sp_counted_ptr<torch::nn::LinearImpl*, (__gnu_cxx::_Lock_policy)2>'
  #
  # (NB: the textual types of the objects here are misleading, because
  # they actually line up; it just so happens that there are two copies
  # of the type info floating around in the address space, so they
  # don't pointer compare equal. See also
  #   https://github.com/google/sanitizers/issues/1175
  #
  # UBSAN is kind of right here: if we relied on RTTI across C++ extension
  # modules they would indeed do the wrong thing; but in our codebase, we
  # don't use RTTI (because it doesn't work in mobile). To appease
  # UBSAN, however, it's better if we ensure all the copies agree!
  #
  # By the way, an earlier version of this code attempted to load
  # libtorch_python.so with LD_PRELOAD, which has a similar effect of causing
  # it to be loaded globally. This isn't really a good idea though, because
  # it depends on a ton of dynamic libraries that most programs aren't gonna
  # have, and it applies to child processes.

  LD_PRELOAD=$(clang --print-file-name=libclang_rt.asan-x86_64.so)
  export LD_PRELOAD
  # Disable valgrind for asan
  export VALGRIND=OFF

  (cd test && python -c "import torch; print(torch.__version__, torch.version.git_version)")
  echo "The next four invocations are expected to crash; if they don't that means ASAN/UBSAN is misconfigured"
  (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_csrc_asan(3)")
  #(cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_csrc_ubsan(0)")
  (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_vptr_ubsan()")
  (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)")
fi

# The torch._C._crash_if_debug_asserts_fail() function should only fail if both of the following are true:
# 1. The build is in debug mode
# 2. The value 424242 is passed in
# This tests that the debug asserts are working correctly.
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
  echo "We are in debug mode: $BUILD_ENVIRONMENT. Expect the python assertion to fail"
  (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_debug_asserts_fail(424242)")
elif [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
  # Noop when debug is disabled. Skip bazel jobs because torch isn't available there yet.
  echo "We are not in debug mode: $BUILD_ENVIRONMENT. Expect the assertion to pass"
  (cd test && python -c "import torch; torch._C._crash_if_debug_asserts_fail(424242)")
fi

if [[ $TEST_CONFIG == 'nogpu_NO_AVX2' ]]; then
  export ATEN_CPU_CAPABILITY=default
elif [[ $TEST_CONFIG == 'nogpu_AVX512' ]]; then
  export ATEN_CPU_CAPABILITY=avx2
fi

if [[ "${TEST_CONFIG}" == "legacy_nvidia_driver" ]]; then
  # Make sure that CUDA can be initialized
  (cd test && python -c "import torch; torch.rand(2, 2, device='cuda')")
  export USE_LEGACY_DRIVER=1
fi

test_python_legacy_jit() {
  time python test/run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose
  assert_git_not_dirty
}

test_python_shard() {
  if [[ -z "$NUM_TEST_SHARDS" ]]; then
    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
    exit 1
  fi

  # A bare --include flag is not supported, and quoting it for lint ends up with the flag not being interpreted correctly
  # shellcheck disable=SC2086

  # Modify LD_LIBRARY_PATH to ensure it has the conda env.
  # This set of tests has been shown to be buggy without it for the split-build
  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running

  assert_git_not_dirty
}

test_python() {
  # shellcheck disable=SC2086
  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
  assert_git_not_dirty
}

test_python_smoke() {
  # Smoke tests for H100/B200
  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
}

test_python_smoke_b200() {
  # Targeted smoke tests for B200 - staged approach to avoid too many failures
  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
}

test_h100_distributed() {
  # Distributed tests on H100
  time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  # This test requires multicast support
  time python test/run_test.py --include distributed/_composable/fsdp/test_fully_shard_comm.py -k TestFullyShardAllocFromPG $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
}

test_h100_symm_mem() {
  # symmetric memory test
  time python test/run_test.py --include distributed/test_symmetric_memory.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  time python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  time python test/run_test.py --include distributed/test_nvshmem_triton.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  time python test/run_test.py --include distributed/test_nccl.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
}

test_h100_cutlass_backend() {
  # cutlass backend tests for H100
  TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_backend -k "not addmm" $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_evt $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
}

test_lazy_tensor_meta_reference_disabled() {
  export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1
  echo "Testing lazy tensor operations without meta reference"
  time python test/run_test.py --include lazy/test_ts_opinfo.py --verbose
  export -n TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE
}

test_dynamo_wrapped_shard() {
  if [[ -z "$NUM_TEST_SHARDS" ]]; then
    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
    exit 1
  fi
  python tools/dynamo/verify_dynamo.py
  # PLEASE DO NOT ADD ADDITIONAL EXCLUDES HERE.
  # Instead, use @skipIfTorchDynamo on your tests.
  time python test/run_test.py --dynamo \
    --exclude-inductor-tests \
    --exclude-jit-executor \
    --exclude-distributed-tests \
    --exclude-torch-export-tests \
    --exclude-aot-dispatch-tests \
    --exclude-quantization-tests \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose \
    --upload-artifacts-while-running
  assert_git_not_dirty
}

test_einops() {
  pip install einops==0.6.1
  time python test/run_test.py --einops --verbose --upload-artifacts-while-running
  pip install einops==0.7.0
  time python test/run_test.py --einops --verbose --upload-artifacts-while-running
  pip install einops==0.8.1
  time python test/run_test.py --einops --verbose --upload-artifacts-while-running
  assert_git_not_dirty
}

test_inductor_distributed() {
  # Smuggle a few multi-gpu tests here so that we don't have to request another large node
  echo "Testing multi_gpu tests in test_torchinductor"
  python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose
  python test/run_test.py -i inductor/test_aot_inductor.py -k test_on_gpu_device1 --verbose
  python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_gpu_device --verbose
  python test/run_test.py -i inductor/test_aot_inductor.py -k test_load_package_multiple_gpus --verbose
  python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
  python test/run_test.py -i distributed/tensor/test_dtensor_compile.py --verbose
  python test/run_test.py -i distributed/tensor/parallel/test_micro_pipeline_tp.py --verbose
  python test/run_test.py -i distributed/_composable/test_replicate_with_compiler.py --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_state_dict.py -k test_dp_state_dict_save_load --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py -k test_clip_grad_norm_2d --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_compile.py --verbose
  python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose

  # This runs on both single-gpu and multi-gpu instances. It should be smart about skipping
  # tests that aren't supported if the required number of GPUs isn't available
  python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_aten_comm_compute_reordering distributed/test_compute_comm_reordering --verbose
  assert_git_not_dirty
}

test_inductor_shard() {
  if [[ -z "$NUM_TEST_SHARDS" ]]; then
    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
    exit 1
  fi

  python tools/dynamo/verify_dynamo.py
  python test/run_test.py --inductor \
    --include test_modules test_ops test_ops_gradients test_torch \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose

  # Do not add --inductor for the following inductor unit tests, otherwise we will fail because of nested dynamo state
  python test/run_test.py \
    --include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose
}

test_inductor_aoti() {
  # docker build uses bdist_wheel which does not work with test_aot_inductor
  # TODO: need a faster way to build
  if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
    # We need to hipify before building again
    python3 tools/amd_build/build_amd.py
  fi
  if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
    BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
    # TODO: Replace me completely, as one should not use conda libstdc++, nor need a special path to TORCH_LIB
    TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
  else
    BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
    TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
  fi

  # The aoti cmake custom command requires `torch` to be installed:
  # initialize the cmake build cache and install torch
  /usr/bin/env "${BUILD_COMMAND[@]}"
  # rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
  /usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"

  /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
}

test_inductor_aoti_cross_compile_for_windows() {

  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  # Set the WINDOWS_CUDA_HOME environment variable
  WINDOWS_CUDA_HOME="$(pwd)/win-torch-wheel-extracted"
  export WINDOWS_CUDA_HOME

  echo "WINDOWS_CUDA_HOME is set to: $WINDOWS_CUDA_HOME"
  echo "Contents:"
  ls -lah "$(pwd)/win-torch-wheel-extracted/lib/x64/" || true

  python test/inductor/test_aoti_cross_compile_windows.py -k compile --package-dir "$TEST_REPORTS_DIR" --win-torch-lib-dir "$(pwd)/win-torch-wheel-extracted/torch/lib"
}

test_inductor_cpp_wrapper_shard() {
  if [[ -z "$NUM_TEST_SHARDS" ]]; then
    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
    exit 1
  fi

  export TORCHINDUCTOR_CPP_WRAPPER=1
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  # Run certain inductor unit tests with cpp wrapper. In the end state, we
  # should be able to run all the inductor unit tests with cpp_wrapper.
  #
  # TODO: I'm pretty sure that "TestInductorOpInfoCPU" is not a valid filter,
  # but change that in another PR to more accurately monitor the increased CI
  # usage.
  python test/run_test.py \
    --include inductor/test_torchinductor_opinfo \
    -k 'linalg or to_sparse or TestInductorOpInfoCPU' \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose
  python test/run_test.py \
    --include inductor/test_torchinductor inductor/test_max_autotune inductor/test_cpu_repro \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose
  python test/run_test.py --inductor \
    --include test_torch \
    -k 'take' \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose

  if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
    python test/run_test.py \
      --include inductor/test_mkldnn_pattern_matcher \
      -k 'xpu' \
      --shard "$1" "$NUM_TEST_SHARDS" \
      --verbose
  fi
}

# "Global" flags for inductor benchmarking controlled by TEST_CONFIG.
# For example, a 'dynamic_aot_eager_torchbench' TEST_CONFIG means we run
# the benchmark script with '--dynamic-shapes --backend aot_eager --device cuda'.
# The matrix of test options is specified in .github/workflows/inductor.yml,
# .github/workflows/inductor-periodic.yml, and
# .github/workflows/inductor-perf-test-nightly.yml
DYNAMO_BENCHMARK_FLAGS=()
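
# Illustrative example (not executed): for TEST_CONFIG=dynamic_aot_eager_torchbench,
# the branches below would leave the array as
#   DYNAMO_BENCHMARK_FLAGS=(--backend aot_eager --dynamic-shapes --dynamic-batch-only --device cuda)
# which is then spliced into the benchmark invocations in test_single_dynamo_benchmark.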

pr_time_benchmarks() {

  pip_install "fbscribelogger"

  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"
  PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks source benchmarks/dynamo/pr_time_benchmarks/benchmark_runner.sh "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv" "benchmarks/dynamo/pr_time_benchmarks/benchmarks"
  echo "benchmark results on current PR: "
  cat "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv"
  PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks python benchmarks/dynamo/pr_time_benchmarks/check_results.py "benchmarks/dynamo/pr_time_benchmarks/expected_results.csv" "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv" "$TEST_REPORTS_DIR/new_expected_results.csv"
}

if [[ "${TEST_CONFIG}" == *pr_time_benchmarks* ]]; then
  pr_time_benchmarks
  exit 0
elif [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--backend eager)
elif [[ "${TEST_CONFIG}" == *aot_eager* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--backend aot_eager)
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--export-aot-inductor)
elif [[ "${TEST_CONFIG}" == *max_autotune_inductor* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--inductor --inductor-compile-mode max-autotune)
elif [[ "${TEST_CONFIG}" == *inductor* && "${TEST_CONFIG}" != *perf* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--inductor)
fi

if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--dynamic-shapes --dynamic-batch-only)
fi

if [[ "${TEST_CONFIG}" == *cpu* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
else
  DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
fi

test_cachebench() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  local BENCHMARK
  if [[ "${SHARD_NUMBER}" == 1 ]]; then
    local BENCHMARK=torchbench
  elif [[ "${SHARD_NUMBER}" == 2 ]]; then
    local BENCHMARK=huggingface
  else
    echo "invalid SHARD_NUMBER: ${SHARD_NUMBER}"
    exit 1
  fi

  local mode_options=("training" "inference")

  for mode in "${mode_options[@]}"; do
    $TASKSET python "benchmarks/dynamo/cachebench.py" \
      --mode "$mode" \
      --device cuda \
      --benchmark "$BENCHMARK" \
      --repeat 3 \
      --output "$TEST_REPORTS_DIR/cachebench_${BENCHMARK}_${mode}.json"

    $TASKSET python "benchmarks/dynamo/cachebench.py" \
      --mode "$mode" \
      --dynamic \
      --device cuda \
      --benchmark "$BENCHMARK" \
      --repeat 3 \
      --output "$TEST_REPORTS_DIR/cachebench_${BENCHMARK}_${mode}_dynamic.json"
  done
}

test_verify_cachebench() {
  TMP_TEST_REPORTS_DIR=$(mktemp -d)
  TEST_OUTPUT="$TMP_TEST_REPORTS_DIR/test.json"

  $TASKSET python "benchmarks/dynamo/cachebench.py" \
    --mode training \
    --device cpu \
    --model nanogpt \
    --benchmark torchbench \
    --output "$TEST_OUTPUT"

  # -s checks that the file exists and is non-empty
  if [[ ! -s "$TEST_OUTPUT" ]]; then
    echo "Cachebench failed to produce an output."
    echo "Run 'python benchmarks/dynamo/cachebench.py' to make sure it works"
    exit 1
  fi
}

test_perf_for_dashboard() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  local suite="$1"
  shift

  local backend=inductor
  local modes=()
  if [[ "$DASHBOARD_TAG" == *training-true* ]]; then
    modes+=(training)
  fi
  if [[ "$DASHBOARD_TAG" == *inference-true* ]]; then
    modes+=(inference)
  fi
  # TODO: All the accuracy tests can be skipped once the CI accuracy checking is stable enough
  local targets=(accuracy performance)

  local device=cuda
  if [[ "${TEST_CONFIG}" == *cpu* ]]; then
    if [[ "${TEST_CONFIG}" == *cpu_x86_zen* ]]; then
      device=cpu_x86_zen
    elif [[ "${TEST_CONFIG}" == *cpu_x86* ]]; then
      device=cpu_x86
    elif [[ "${TEST_CONFIG}" == *cpu_aarch64* ]]; then
      device=cpu_aarch64
    fi
    test_inductor_set_cpu_affinity
  elif [[ "${TEST_CONFIG}" == *cuda_a10g* ]]; then
    device=cuda_a10g
  elif [[ "${TEST_CONFIG}" == *h100* ]]; then
    device=cuda_h100
  elif [[ "${TEST_CONFIG}" == *b200* ]]; then
    device=cuda_b200
  elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
    device=rocm
  fi

  for mode in "${modes[@]}"; do
    if [[ "$mode" == "inference" ]]; then
      if [[ "$device" == "cpu_x86" ]]; then
        dtype=amp
      else
        dtype=bfloat16
      fi
    elif [[ "$mode" == "training" ]]; then
      dtype=amp
    fi
    for target in "${targets[@]}"; do
      local target_flag=("--${target}")
      if [[ "$target" == "performance" ]]; then
        target_flag+=( --cold-start-latency)
      elif [[ "$target" == "accuracy" ]]; then
        target_flag+=( --no-translation-validation)
      fi

      if [[ "$DASHBOARD_TAG" == *freezing-true* ]]; then
        target_flag+=( --freezing)
      fi

      if [[ "$DASHBOARD_TAG" == *default-true* ]]; then
        $TASKSET python "benchmarks/dynamo/$suite.py" \
          "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
          --output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *cudagraphs-true* ]]; then
        $TASKSET python "benchmarks/dynamo/$suite.py" \
          "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
          --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *dynamic-true* ]]; then
        $TASKSET python "benchmarks/dynamo/$suite.py" \
          "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes \
          --dynamic-batch-only "$@" \
          --output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *cppwrapper-true* ]]; then
        TORCHINDUCTOR_CPP_WRAPPER=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
          "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
          --output "$TEST_REPORTS_DIR/${backend}_cpp_wrapper_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *freezing_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
        $TASKSET python "benchmarks/dynamo/$suite.py" \
          "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
          --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *freeze_autotune_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
        TORCHINDUCTOR_MAX_AUTOTUNE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
          "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
          --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_autotune_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *aotinductor-true* ]] && [[ "$mode" == "inference" ]]; then
        if [[ "$target" == "accuracy" ]]; then
          # Also collect the Export pass rate and display it as a separate row
          $TASKSET python "benchmarks/dynamo/$suite.py" \
            "${target_flag[@]}" --"$mode" --"$dtype" --export --disable-cudagraphs "$@" \
            --output "$TEST_REPORTS_DIR/${backend}_export_${suite}_${dtype}_${mode}_${device}_${target}.csv"
        fi
        $TASKSET python "benchmarks/dynamo/$suite.py" \
          "${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \
          --output "$TEST_REPORTS_DIR/${backend}_aot_inductor_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *maxautotune-true* ]]; then
        TORCHINDUCTOR_MAX_AUTOTUNE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
          "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
          --output "$TEST_REPORTS_DIR/${backend}_max_autotune_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *cudagraphs_low_precision-true* ]] && [[ "$mode" == "inference" ]]; then
        # TODO: This has a new dtype called quant and the benchmarks script needs to be updated to support this.
        # The tentative command is as follows. It doesn't work now, but that's ok because we only need mock data
        # to fill the dashboard.
        $TASKSET python "benchmarks/dynamo/$suite.py" \
          "${target_flag[@]}" --"$mode" --quant --backend "$backend" "$@" \
          --output "$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_${device}_${target}.csv" || true
        # Copy cudagraph results as mock data, easiest choice?
        cp "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv" \
          "$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_${device}_${target}.csv"
      fi
    done
  done
}

test_single_dynamo_benchmark() {
  # Usage: test_single_dynamo_benchmark inductor_inference huggingface 0 --args-for-script

  # Using the test-reports directory under the test folder allows the CI to automatically
  # pick up the test reports and upload them to S3. We need to use the full path here,
  # otherwise the script will complain about the file not being found later on.
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  local name="$1"
  shift
  local suite="$1"
  shift
  # shard id is mandatory, even if it is not passed
  local shard_id="$1"
  shift

  local partition_flags=()
  if [[ -n "$NUM_TEST_SHARDS" && -n "$shard_id" ]]; then
    partition_flags=( --total-partitions "$NUM_TEST_SHARDS" --partition-id "$shard_id" )
  fi

  if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
    python "benchmarks/dynamo/$suite.py" \
      --ci --performance --disable-cudagraphs --inductor \
      "${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}" \
      --output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
  elif [[ "${TEST_CONFIG}" == *perf* ]]; then
    test_perf_for_dashboard "$suite" \
      "${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}"
  else
    if [[ "${TEST_CONFIG}" == *_avx2* ]]; then
      TEST_CONFIG=${TEST_CONFIG//_avx2/}
    fi
    if [[ "${TEST_CONFIG}" == *_avx512* ]]; then
      TEST_CONFIG=${TEST_CONFIG//_avx512/}
    fi
    python "benchmarks/dynamo/$suite.py" \
      --ci --accuracy --timing --explain --print-compilation-time \
      "${DYNAMO_BENCHMARK_FLAGS[@]}" \
      "$@" "${partition_flags[@]}" \
      --output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
    python benchmarks/dynamo/check_accuracy.py \
      --actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
      --expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}${TEST_CONFIG}_${name}.csv"
    python benchmarks/dynamo/check_graph_breaks.py \
      --actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
      --expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}${TEST_CONFIG}_${name}.csv"
  fi
}

test_inductor_micro_benchmark() {
  # torchao requires CUDA compute capability 8.0 or above for bfloat16 support
  if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;8.6"
  fi
  install_torchao
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  if [[ "${TEST_CONFIG}" == *cpu* ]]; then
    test_inductor_set_cpu_affinity
  fi
  python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
}

test_inductor_halide() {
  python test/run_test.py --include inductor/test_halide.py --verbose
  assert_git_not_dirty
}

test_inductor_triton_cpu() {
  python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose
  assert_git_not_dirty
}

test_dynamo_benchmark() {
  # Usage: test_dynamo_benchmark huggingface 0
  TEST_REPORTS_DIR=$(pwd)/test/test-reports

  local suite="$1"
  shift
  local shard_id="$1"
  shift

  if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
    test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
  elif [[ "${TEST_CONFIG}" == *perf* ]]; then
    # TODO (huydhn): Just smoke test some sample models
    if [[ "${TEST_CONFIG}" == *b200* ]]; then
      if [[ "${suite}" == "huggingface" ]]; then
        export TORCHBENCH_ONLY_MODELS="DistillGPT2"
      elif [[ "${suite}" == "timm_models" ]]; then
        export TORCHBENCH_ONLY_MODELS="inception_v3"
      elif [[ "${suite}" == "torchbench" ]]; then
        export TORCHBENCH_ONLY_MODELS="BERT_pytorch"
      fi
    fi
    test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
  else
    if [[ "${TEST_CONFIG}" == *cpu* ]]; then
      local dt="float32"
      if [[ "${TEST_CONFIG}" == *amp* ]]; then
        dt="amp"
      fi
      if [[ "${TEST_CONFIG}" == *freezing* ]]; then
        test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "$@"
      else
        test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "$@"
      fi
    elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
    elif [[ "${TEST_CONFIG}" == *max_autotune_inductor* ]]; then
      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
    else
      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
      test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
    fi
  fi
}

test_inductor_torchbench_smoketest_perf() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
    --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only BERT_pytorch \
    --output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
  # The threshold value needs to be actively maintained to make this check useful
  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4

  # Check the memory compression ratio for a few models
  for test in BERT_pytorch yolov3; do
    python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
      --disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
      --only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"
    cat "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"
    python benchmarks/dynamo/check_memory_compression_ratio.py --actual \
      "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \
      --expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
  done

  # Perform some "warm-start" runs for a few huggingface models.
  for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
    python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
      --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
    python benchmarks/dynamo/check_accuracy.py \
      --actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \
      --expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}inductor_huggingface_training.csv"
  done
}

test_inductor_set_cpu_affinity(){
  JEMALLOC_LIB="$(find /usr/lib -name libjemalloc.so.2)"
  export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD"
  export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"

  if [[ "$(uname -m)" != "aarch64" ]]; then
    # Use Intel OpenMP for x86
    IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so"
    export LD_PRELOAD="$IOMP_LIB":"$LD_PRELOAD"
    export KMP_AFFINITY=granularity=fine,compact,1,0
    export KMP_BLOCKTIME=1
  fi

  # Use nproc here instead of lscpu because it takes the cgroups slice into account
  cpus=$(nproc)
  thread_per_core=$(lscpu | grep 'Thread(s) per core:' | awk '{print $4}')
  cores=$((cpus / thread_per_core))

  # Cap the number of cores at 16 on aarch64 for performance runs
  if [[ "$(uname -m)" == "aarch64" && $cores -gt 16 ]]; then
    cores=16
  fi
  export OMP_NUM_THREADS=$cores

  # Handle the cgroups slice start and end CPU
  start_cpu=$(python -c 'import os; print(min(os.sched_getaffinity(0)))')
  # Leave one physical core for other tasks
  end_cpu=$(($(python -c 'import os; print(max(os.sched_getaffinity(0)))') - thread_per_core))
  export TASKSET="taskset -c $start_cpu-$end_cpu"
}
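
# Worked example (hypothetical host) of the affinity math above: with nproc=64 and
# 2 threads per core, cores=32 so OMP_NUM_THREADS=32; if the affinity mask spans
# CPUs 0-63, then start_cpu=0 and end_cpu=63-2=61, giving
#   TASKSET="taskset -c 0-61"
# which the benchmark invocations below use as a command prefix.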

test_inductor_torchbench_cpu_smoketest_perf(){
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  test_inductor_set_cpu_affinity
  MODELS_SPEEDUP_TARGET=benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv

  grep -v '^ *#' < "$MODELS_SPEEDUP_TARGET" | while IFS=',' read -r -a model_cfg
  do
    local model_name=${model_cfg[0]}
    local data_type=${model_cfg[2]}
    local speedup_target=${model_cfg[5]}
    local backend=${model_cfg[1]}
    if [[ ${model_cfg[4]} == "cpp" ]]; then
      export TORCHINDUCTOR_CPP_WRAPPER=1
    else
      unset TORCHINDUCTOR_CPP_WRAPPER
    fi
    local output_name="$TEST_REPORTS_DIR/inductor_inference_${model_cfg[0]}_${model_cfg[1]}_${model_cfg[2]}_${model_cfg[3]}_cpu_smoketest.csv"

    if [[ ${model_cfg[3]} == "dynamic" ]]; then
      $TASKSET python benchmarks/dynamo/torchbench.py \
        --inference --performance --"$data_type" -dcpu -n50 --only "$model_name" --dynamic-shapes \
        --dynamic-batch-only --freezing --timeout 9000 --"$backend" --output "$output_name"
    else
      $TASKSET python benchmarks/dynamo/torchbench.py \
        --inference --performance --"$data_type" -dcpu -n50 --only "$model_name" \
        --freezing --timeout 9000 --"$backend" --output "$output_name"
    fi
    cat "$output_name"
    # The threshold value needs to be actively maintained to make this check useful.
    # Allow 1% variance for CPU perf to accommodate perf fluctuation
    python benchmarks/dynamo/check_perf_csv.py -f "$output_name" -t "$speedup_target" -s 0.99
  done
}

test_torchbench_gcp_smoketest(){
  pushd "${TORCHBENCHPATH}"
  python test.py -v
  popd
}

test_aten() {
  # Test ATen
  # The following test(s) of ATen have already been skipped by caffe2 in the rocm environment:
  # scalar_tensor_test, basic, native_test
  echo "Running ATen tests with pytorch lib"

  if [[ -n "$IN_WHEEL_TEST" ]]; then
    echo "Running test with the install folder"
    # Rename the build folder while running the test to ensure the test
    # does not depend on the build folder
    mv "$BUILD_DIR" "$BUILD_RENAMED_DIR"
    TEST_BASE_DIR="$TORCH_TEST_DIR"
  else
    echo "Running test with the build folder"
    TEST_BASE_DIR="$BUILD_BIN_DIR"
  fi

  # NB: the ATen test binaries don't have RPATH set, so it's necessary to
  # put the dynamic libraries somewhere the dynamic linker can find them.
  # This is a bit of a hack.
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libc10* "$TEST_BASE_DIR"
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libcaffe2* "$TEST_BASE_DIR"
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libmkldnn* "$TEST_BASE_DIR"
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libnccl* "$TEST_BASE_DIR"
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libtorch* "$TEST_BASE_DIR"

  ls "$TEST_BASE_DIR"
  aten/tools/run_tests.sh "$TEST_BASE_DIR"

  if [[ -n "$IN_WHEEL_TEST" ]]; then
    # Restore the build folder to avoid any impact on other tests
    mv "$BUILD_RENAMED_DIR" "$BUILD_DIR"
  fi

  assert_git_not_dirty
}

test_without_numpy() {
  pushd "$(dirname "${BASH_SOURCE[0]}")"
  python -c "import sys;sys.path.insert(0, 'fake_numpy');from unittest import TestCase;import torch;x=torch.randn(3,3);TestCase().assertRaises(RuntimeError, lambda: x.numpy())"
  # Regression test for https://github.com/pytorch/pytorch/issues/66353
  python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;print(torch.tensor([torch.tensor(0.), torch.tensor(1.)]))"
  # Regression test for https://github.com/pytorch/pytorch/issues/109387
  if [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
    python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;torch.compile(lambda x:print(x))('Hello World')"
  fi
  # Regression test for https://github.com/pytorch/pytorch/pull/157734 (torch.onnx should be importable without numpy)
  python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch; import torch.onnx"
  popd
}

test_libtorch() {
  local SHARD="$1"

  # The slow test config corresponds to a default test config that should run
  # the libtorch tests instead.
  if [[ "$TEST_CONFIG" != "slow" ]]; then
    echo "Testing libtorch"
    ln -sf "$TORCH_LIB_DIR"/libbackend_with_compiler.so "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libjitbackend_test.so "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libcaffe2_nvrtc.so "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libnvfuser* "$TORCH_BIN_DIR"

    export CPP_TESTS_DIR="${TORCH_BIN_DIR}"

    if [[ -z "${SHARD}" || "${SHARD}" == "1" ]]; then
      test_libtorch_api
    fi

    if [[ -z "${SHARD}" || "${SHARD}" == "2" ]]; then
      test_libtorch_jit
    fi

    assert_git_not_dirty
  fi
}

test_libtorch_jit() {
  # Prepare the model used by test_jit; the model needs to be in the test directory
  # to get picked up by run_test
  pushd test
  python cpp/jit/tests_setup.py setup
  popd

  # Run jit and lazy tensor cpp tests together to finish them faster
  if [[ "$BUILD_ENVIRONMENT" == *cuda* && "$TEST_CONFIG" != *nogpu* ]]; then
    LTC_TS_CUDA=1 python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy
  else
    # CUDA tests have already been skipped when CUDA is not available
    python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy -k "not CUDA"
  fi

  # Clean up test artifacts in the test folder
  pushd test
  python cpp/jit/tests_setup.py shutdown
  popd
}

test_libtorch_api() {
  # Download the MNIST data used by the C++ API tests
  MNIST_DIR="${PWD}/test/cpp/api/mnist"
  python tools/download_mnist.py --quiet -d "${MNIST_DIR}"

  if [[ "$BUILD_ENVIRONMENT" == *asan* || "$BUILD_ENVIRONMENT" == *slow-gradcheck* ]]; then
    TEST_REPORTS_DIR=test/test-reports/cpp-unittest/test_libtorch
    mkdir -p $TEST_REPORTS_DIR

    OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
  else
    # Exclude IMethodTest, which relies on torch::deploy and is instead run in test_deploy
    OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"
  fi

  # quantization is not fully supported on s390x yet
  if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* && "${BUILD_ENVIRONMENT}" != *asan* && "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
    # NB: This test is not under TORCH_BIN_DIR but under BUILD_BIN_DIR
    export CPP_TESTS_DIR="${BUILD_BIN_DIR}"
    python test/run_test.py --cpp --verbose -i cpp/static_runtime_test
  fi
}

test_xpu_bin(){
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  for xpu_case in "${BUILD_BIN_DIR}"/*{xpu,sycl}*; do
    if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then
      case_name=$(basename "$xpu_case")
      echo "Testing ${case_name} ..."
      "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml
    fi
  done
}
|
|
|
|
test_aot_compilation() {
|
|
echo "Testing Ahead of Time compilation"
|
|
ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
|
|
ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
|
|
|
|
if [ -f "$TORCH_BIN_DIR"/test_mobile_nnc ]; then
|
|
CPP_TESTS_DIR="${TORCH_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_mobile_nnc
|
|
fi
|
|
|
|
if [ -f "$TORCH_BIN_DIR"/aot_model_compiler_test ]; then
|
|
source test/mobile/nnc/test_aot_compile.sh
|
|
fi
|
|
}
|
|
|
|
test_vulkan() {
|
|
if [[ "$BUILD_ENVIRONMENT" == *vulkan* ]]; then
|
|
ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_TEST_DIR"
|
|
ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_TEST_DIR"
|
|
export VK_ICD_FILENAMES=/var/lib/jenkins/swiftshader/swiftshader/build/Linux/vk_swiftshader_icd.json
|
|
CPP_TESTS_DIR="${TORCH_TEST_DIR}" LD_LIBRARY_PATH=/var/lib/jenkins/swiftshader/swiftshader/build/Linux/ python test/run_test.py --cpp --verbose -i cpp/vulkan_api_test
|
|
fi
|
|
}
|
|
|
|
test_distributed() {
|
|
echo "Testing distributed python tests"
|
|
# shellcheck disable=SC2086
|
|
time python test/run_test.py --distributed-tests --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" $INCLUDE_CLAUSE --verbose
|
|
assert_git_not_dirty
|
|
|
|
if [[ ("$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm*) && "$SHARD_NUMBER" == 1 ]]; then
|
|
echo "Testing distributed C++ tests"
|
|
ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
|
|
ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
|
|
|
|
export CPP_TESTS_DIR="${TORCH_BIN_DIR}"
|
|
# These are distributed tests, so let's continue running them sequentially here to avoid
|
|
# any surprise
|
|
python test/run_test.py --cpp --verbose -i cpp/FileStoreTest
|
|
python test/run_test.py --cpp --verbose -i cpp/HashStoreTest
|
|
python test/run_test.py --cpp --verbose -i cpp/TCPStoreTest
|
|
|
|
echo "Testing multi-GPU linalg tests"
|
|
python test/run_test.py -i test_linalg.py -k test_matmul_offline_mgpu_tunable --verbose
|
|
|
|
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
|
|
MPIEXEC=$(command -v mpiexec)
|
|
if [[ -n "$MPIEXEC" ]]; then
|
|
# NB: mpiexec only works directly with the C++ test binary here
|
|
MPICMD="${MPIEXEC} -np 2 $TORCH_BIN_DIR/ProcessGroupMPITest"
|
|
eval "$MPICMD"
|
|
fi
|
|
|
|
python test/run_test.py --cpp --verbose -i cpp/ProcessGroupGlooTest
|
|
python test/run_test.py --cpp --verbose -i cpp/ProcessGroupNCCLTest
|
|
python test/run_test.py --cpp --verbose -i cpp/ProcessGroupNCCLErrorsTest
|
|
fi
|
|
fi
|
|
}
|
|
|
|
test_quantization() {
|
|
echo "Testing quantization"
|
|
|
|
python test/test_quantization.py
|
|
}
|
|
|
|
test_rpc() {
|
|
echo "Testing RPC C++ tests"
|
|
# NB: the ending test_rpc must match the current function name for the current
|
|
# test reporting process to function as expected.
|
|
ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
|
|
ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
|
|
|
|
CPP_TESTS_DIR="${TORCH_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_cpp_rpc
|
|
}
|
|
|
|
test_custom_backend() {
|
|
echo "Testing custom backends"
|
|
CUSTOM_BACKEND_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-backend-build"
|
|
pushd test/custom_backend
|
|
cp -a "$CUSTOM_BACKEND_BUILD" build
|
|
# Run tests Python-side and export a lowered module.
|
|
python test_custom_backend.py -v
|
|
python backend.py --export-module-to=model.pt
|
|
# Run tests C++-side and load the exported lowered module.
|
|
build/test_custom_backend ./model.pt
|
|
rm -f ./model.pt
|
|
popd
|
|
assert_git_not_dirty
|
|
}
|
|
|
|
test_custom_script_ops() {
|
|
echo "Testing custom script operators"
|
|
|
|
if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then
|
|
echo "Skipping custom script operators until it's fixed"
|
|
return 0
|
|
fi
|
|
|
|
CUSTOM_OP_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-op-build"
|
|
pushd test/custom_operator
|
|
cp -a "$CUSTOM_OP_BUILD" build
|
|
# Run tests Python-side and export a script module.
|
|
python test_custom_ops.py -v
|
|
python model.py --export-script-module=model.pt
|
|
# Run tests C++-side and load the exported script module.
|
|
build/test_custom_ops ./model.pt
|
|
popd
|
|
assert_git_not_dirty
|
|
}
|
|
|
|
test_jit_hooks() {
|
|
echo "Testing jit hooks in cpp"
|
|
HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
|
|
pushd test/jit_hooks
|
|
cp -a "$HOOK_BUILD" build
|
|
# Run tests Python-side and export the script modules with hooks
|
|
python model.py --export-script-module=model
|
|
# Run tests C++-side and load the exported script modules
|
|
build/test_jit_hooks ./model
|
|
popd
|
|
assert_git_not_dirty
|
|
}
|
|
|
|
test_torch_function_benchmark() {
|
|
echo "Testing __torch_function__ benchmarks"
|
|
pushd benchmarks/overrides_benchmark
|
|
python bench.py -n 1 -m 2
|
|
python pyspybench.py Tensor -n 1
|
|
python pyspybench.py SubTensor -n 1
|
|
python pyspybench.py WithTorchFunction -n 1
|
|
python pyspybench.py SubWithTorchFunction -n 1
|
|
popd
|
|
assert_git_not_dirty
|
|
}
|
|
|
|
build_xla() {
|
|
# xla test needs pytorch headers in torch/include
|
|
pushd ..
|
|
python -c "import os, torch, shutil; shutil.copytree(os.path.join(os.path.dirname(torch.__file__), 'include'), 'workspace/torch/include', dirs_exist_ok=True)"
|
|
popd
|
|
|
|
# xla test needs sccache setup.
|
|
# shellcheck source=./common-build.sh
|
|
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
|
|
|
|
XLA_DIR=xla
|
|
USE_CACHE=1
|
|
clone_pytorch_xla
|
|
# shellcheck disable=SC1091
|
|
source "xla/.circleci/common.sh"
|
|
|
|
# TODO: The torch pin #73164 is involved in the sev https://github.com/pytorch/pytorch/issues/86093
|
|
# so this is temporarily removed until XLA fixes the weird logic in https://github.com/pytorch/xla/blob/master/scripts/apply_patches.sh#L17-L18
|
|
rm "${XLA_DIR}/torch_patches/.torch_pin" || true
|
|
|
|
apply_patches
|
|
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
|
|
# These functions are defined in .circleci/common.sh in pytorch/xla repo
|
|
retry install_pre_deps_pytorch_xla $XLA_DIR $USE_CACHE
|
|
CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch:${CMAKE_PREFIX_PATH}" XLA_SANDBOX_BUILD=1 build_torch_xla $XLA_DIR
|
|
assert_git_not_dirty
|
|
}
|
|
|
|
test_xla() {
  # xla test needs sccache setup.
  # shellcheck source=./common-build.sh
  source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"

  clone_pytorch_xla
  # shellcheck disable=SC1091
  source "./xla/.circleci/common.sh"
  SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
  # Set LD_LIBRARY_PATH for C++ tests
  export LD_LIBRARY_PATH="/opt/conda/lib/:${LD_LIBRARY_PATH}"
  CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch:${CMAKE_PREFIX_PATH}" XLA_SKIP_MP_OP_TESTS=1 run_torch_xla_tests "$(pwd)" "$(pwd)/xla"
  assert_git_not_dirty
}

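# build_xla and test_xla are meant to run back to back; the *xla* branch of the
# dispatch at the bottom of this file calls install_torchvision, build_xla, and
# then test_xla in exactly that order.
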
function check_public_api_test_fails {
  test_name=$1
  invalid_item_name=$2
  invalid_item_desc=$3

  echo "Running public API test '${test_name}'..."
  test_output=$(python test/test_public_bindings.py -k "${test_name}" 2>&1) && ret=$? || ret=$?

  # Ensure test fails correctly.
  if [ "$ret" -eq 0 ]; then
    cat << EOF
Expected the public API test '${test_name}' to fail after introducing
${invalid_item_desc}, but it succeeded! Check test/test_public_bindings.py
for any changes that may have broken the test.
EOF
    return 1
  fi

  # Ensure invalid item is in the test output.
  echo "${test_output}" | grep -q "${invalid_item_name}" && ret=$? || ret=$?

  if [ "$ret" -ne 0 ]; then
    cat << EOF
Expected the public API test '${test_name}' to identify ${invalid_item_desc}, but
it didn't! It's possible the test may not have run. Check test/test_public_bindings.py
for any changes that may have broken the test.
EOF
    return 1
  fi

  echo "Success! '${test_name}' identified ${invalid_item_desc} ${invalid_item_name}."
  return 0
}

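# Example call (a sketch; the module path below is hypothetical -- the real
# callers in test_forward_backward_compatibility derive it from a temp file):
#   check_public_api_test_fails \
#       "test_correct_module_names" \
#       "torch.tmpabcd1234.new_public_func" \
#       "an invalid public API function" && ret=$? || ret=$?
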
# Do NOT run this test before any other tests, like test_python_shard, etc.,
# because this function uninstalls the torch built from the branch and installs
# the torch built at its base commit.
test_forward_backward_compatibility() {
  set -x

  # First, validate public API tests in the torch built from branch.
  # Step 1. Make sure the public API test "test_correct_module_names" fails when a new file
  # introduces an invalid public API function.
  new_filename=$(mktemp XXXXXXXX.py -p "${TORCH_INSTALL_DIR}")

  BAD_PUBLIC_FUNC=$(
  cat << 'EOF'
def new_public_func():
    pass

# valid public API functions have __module__ set correctly
new_public_func.__module__ = None
EOF
)

  echo "${BAD_PUBLIC_FUNC}" >> "${new_filename}"
  invalid_api="torch.$(basename -s '.py' "${new_filename}").new_public_func"
  echo "Created an invalid public API function ${invalid_api}..."

  check_public_api_test_fails \
      "test_correct_module_names" \
      "${invalid_api}" \
      "an invalid public API function" && ret=$? || ret=$?

  rm -v "${new_filename}"

  if [ "$ret" -ne 0 ]; then
    exit 1
  fi

  # Step 2. Make sure that the public API test "test_correct_module_names" fails when an existing
  # file is modified to introduce an invalid public API function.
  # The filepath here must not have __all__ defined in it, otherwise the test will pass.
  # If your PR introduces __all__ to torch/cuda/streams.py please point this to another file
  # that does not have __all__ defined.
  EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/cuda/streams.py"
  cp -v "${EXISTING_FILEPATH}" "${EXISTING_FILEPATH}.orig"
  echo "${BAD_PUBLIC_FUNC}" >> "${EXISTING_FILEPATH}"
  invalid_api="torch.cuda.streams.new_public_func"
  echo "Appended an invalid public API function to existing file ${EXISTING_FILEPATH}..."

  check_public_api_test_fails \
      "test_correct_module_names" \
      "${invalid_api}" \
      "an invalid public API function" && ret=$? || ret=$?

  mv -v "${EXISTING_FILEPATH}.orig" "${EXISTING_FILEPATH}"

  if [ "$ret" -ne 0 ]; then
    exit 1
  fi

  # Step 3. Make sure that the public API test "test_modules_can_be_imported" fails when a module
  # cannot be imported.
  new_module_dir=$(mktemp XXXXXXXX -d -p "${TORCH_INSTALL_DIR}")
  echo "invalid syntax garbage" > "${new_module_dir}/__init__.py"
  invalid_module_name="torch.$(basename "${new_module_dir}")"

  check_public_api_test_fails \
      "test_modules_can_be_imported" \
      "${invalid_module_name}" \
      "a non-importable module" && ret=$? || ret=$?

  rm -rv "${new_module_dir}"

  if [ "$ret" -ne 0 ]; then
    exit 1
  fi

  # Next, build torch from the merge base.
  REPO_DIR=$(pwd)
  if [[ "${BASE_SHA}" == "${SHA1}" ]]; then
    echo "On trunk, we should compare schemas with torch built from the parent commit"
    SHA_TO_COMPARE=$(git rev-parse "${SHA1}"^)
  else
    echo "On pull, we should compare schemas with torch built from the merge base"
    SHA_TO_COMPARE=$(git merge-base "${SHA1}" "${BASE_SHA}")
  fi
  export SHA_TO_COMPARE

  # Create a dummy TorchScript model at this version.
  python test/create_dummy_torchscript_model.py /tmp/model_new.pt
  python -m venv venv
  # shellcheck disable=SC1091
  . venv/bin/activate

  # Build torch at the base commit to generate a base function schema for comparison.
  git reset --hard "${SHA_TO_COMPARE}"
  git submodule sync && git submodule update --init --recursive
  echo "::group::Installing Torch From Base Commit"
  pip3 install -r requirements.txt
  # shellcheck source=./common-build.sh
  source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
  python -m build --wheel --no-isolation -C--build-option=--bdist-dir="base_bdist_tmp" --outdir "base_dist"
  python -mpip install base_dist/*.whl
  echo "::endgroup::"

  pushd test/forward_backward_compatibility
  pip show torch
  python dump_all_function_schemas.py --filename nightly_schemas.txt

  git reset --hard "${SHA1}"
  git submodule sync && git submodule update --init --recursive
  # FC: verify that the new model can be loaded with the old code.
  if ! python ../load_torchscript_model.py /tmp/model_new.pt; then
    echo "FC check failed: new model cannot be loaded with old code"
    return 1
  fi
  python ../create_dummy_torchscript_model.py /tmp/model_old.pt
  deactivate
  rm -r "${REPO_DIR}/venv" "${REPO_DIR}/base_dist"
  pip show torch
  python check_forward_backward_compatibility.py --existing-schemas nightly_schemas.txt
  # BC: verify that the old model can be loaded with the new code.
  if ! python ../load_torchscript_model.py /tmp/model_old.pt; then
    echo "BC check failed: old model cannot be loaded with new code"
    return 1
  fi
  popd
  set +x
  assert_git_not_dirty
}

test_bazel() {
  set -e -o pipefail

  # bazel test needs sccache setup.
  # shellcheck source=./common-build.sh
  source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"

  get_bazel

  if [[ "$CUDA_VERSION" == "cpu" ]]; then
    # Test //c10/... without Google flags and logging libraries. The
    # :all_tests target in the subsequent Bazel invocation tests
    # //c10/... with the Google libraries.
    tools/bazel test --config=cpu-only --test_timeout=480 --test_output=all --test_tag_filters=-gpu-required --test_filter=-*CUDA \
      --no//c10:use_gflags --no//c10:use_glog //c10/...

    tools/bazel test --config=cpu-only --test_timeout=480 --test_output=all --test_tag_filters=-gpu-required --test_filter=-*CUDA :all_tests
  else
    # Increase the test timeout to 480 like the CPU tests because modules_test frequently times out
    tools/bazel test --test_timeout=480 --test_output=errors \
      //:any_test \
      //:autograd_test \
      //:dataloader_test \
      //:dispatch_test \
      //:enum_test \
      //:expanding_array_test \
      //:fft_test \
      //:functional_test \
      //:grad_mode_test \
      //:inference_mode_test \
      //:init_test \
      //:jit_test \
      //:memory_test \
      //:meta_tensor_test \
      //:misc_test \
      //:moduledict_test \
      //:modulelist_test \
      //:modules_test \
      //:namespace_test \
      //:nested_test \
      //:nn_utils_test \
      //:operations_test \
      //:ordered_dict_test \
      //:parallel_benchmark_test \
      //:parameterdict_test \
      //:parameterlist_test \
      //:sequential_test \
      //:serialize_test \
      //:special_test \
      //:static_test \
      //:support_test \
      //:tensor_flatten_test \
      //:tensor_indexing_test \
      //:tensor_options_cuda_test \
      //:tensor_options_test \
      //:tensor_test \
      //:torch_dist_autograd_test \
      //:torch_include_test \
      //:transformer_test \
      //:test_bazel \
      //c10/cuda/test:test \
      //c10/test:core_tests \
      //c10/test:typeid_test \
      //c10/test:util/ssize_test \
      //c10/test:util_base_tests
  fi
}

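# Minimal manual repro of the CPU-only c10 subset above (a sketch; assumes
# bazel was already fetched via get_bazel and the cpu-only config is usable):
#   tools/bazel test --config=cpu-only --test_tag_filters=-gpu-required --test_filter=-*CUDA //c10/...
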
test_benchmarks() {
  if [[ "$BUILD_ENVIRONMENT" == *cuda* && $TEST_CONFIG != *nogpu* ]]; then
    pip_install "pytest-benchmark==3.2.3"
    pip_install "requests"
    BENCHMARK_DATA="benchmarks/.data"
    mkdir -p ${BENCHMARK_DATA}
    pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_default.json --fuser=default --executor=default
    pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_legacy_old.json --fuser=old --executor=legacy
    pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_profiling_te.json --fuser=te --executor=profiling
    # TODO: Enable these for GHA once we have credentials for forked pull requests
    if [[ -z "${GITHUB_ACTIONS}" ]]; then
      python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_default.json
      python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_legacy_old.json
      python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_profiling_te.json
    fi
    assert_git_not_dirty
  fi
}

test_cpp_extensions() {
  # This tests whether the cpp extension build is compatible with the current env. No need to test both the ninja and no-ninja builds.
  time python test/run_test.py --include test_cpp_extensions_aot_ninja --verbose
  assert_git_not_dirty
}

test_vec256() {
  # This tests the vec256 instruction variants DEFAULT/AVX/AVX2 (platform dependent; some platforms might not support AVX/AVX2)
  if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
    echo "Testing vec256 instructions"
    mkdir -p test/test-reports/vec256
    pushd build/bin
    vec256_tests=$(find . -maxdepth 1 -executable -name 'vec256_test*')
    for vec256_exec in $vec256_tests
    do
      $vec256_exec --gtest_output=xml:test/test-reports/vec256/"$vec256_exec".xml
    done
    popd
    assert_git_not_dirty
  fi
}

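# Manual run of a single vec256 binary (a sketch; the binary name below is
# hypothetical -- the loop above discovers the real ones with find):
#   cd build/bin && ./vec256_test_all_types_AVX2 --gtest_output=xml:test/test-reports/vec256/vec256_test_all_types_AVX2.xml
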
test_docs_test() {
  .ci/pytorch/docs-test.sh
}

test_executorch() {
  echo "Install torchvision and torchaudio"
  install_torchvision
  install_torchaudio

  INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh"

  pushd /executorch
  "${INSTALL_SCRIPT}" setup_executorch

  echo "Run ExecuTorch unit tests"
  pytest -v -n auto
  # shellcheck disable=SC1091
  LLVM_PROFDATA=llvm-profdata-12 LLVM_COV=llvm-cov-12 bash test/run_oss_cpp_tests.sh

  echo "Run ExecuTorch regression tests for some models"
  # TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
  # shellcheck disable=SC1091
  source .ci/scripts/test_model.sh mv3 cmake xnnpack-quantization-delegation ''

  popd

  assert_git_not_dirty
}

test_linux_aarch64() {
  python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
    test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
    test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \
    distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
    --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose

  # Dynamo tests
  python test/run_test.py --include dynamo/test_compile dynamo/test_backends dynamo/test_comptime dynamo/test_config \
    dynamo/test_functions dynamo/test_fx_passes_pre_grad dynamo/test_interop dynamo/test_model_output dynamo/test_modules \
    dynamo/test_optimizers dynamo/test_recompile_ux dynamo/test_recompiles \
    --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose

  # Inductor tests
  python test/run_test.py --include inductor/test_torchinductor inductor/test_benchmark_fusion inductor/test_codecache \
    inductor/test_config inductor/test_control_flow inductor/test_coordinate_descent_tuner inductor/test_fx_fusion \
    inductor/test_group_batch_fusion inductor/test_inductor_freezing inductor/test_inductor_utils \
    inductor/test_inplacing_pass inductor/test_kernel_benchmark inductor/test_layout_optim \
    inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \
    inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \
    inductor/test_split_cat_fx_passes inductor/test_compile inductor/test_torchinductor \
    inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes inductor/test_memory \
    inductor/test_triton_cpu_backend inductor/test_triton_extension_backend inductor/test_mkldnn_pattern_matcher inductor/test_cpu_cpp_wrapper \
    --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
}

test_operator_benchmark() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"
  TEST_DIR=$(pwd)
  ARCH=$(uname -m)

  test_inductor_set_cpu_affinity

  cd benchmarks/operator_benchmark/pt_extension
  python -m pip install . -v --no-build-isolation

  cd "${TEST_DIR}"/benchmarks/operator_benchmark
  $TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \
    --output-csv "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \
    --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.json"

  pip_install pandas
  python check_perf_csv.py \
    --actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \
    --expected "${ARCH}_expected_ci_operator_benchmark_eager_float32_cpu.csv"
}

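# Example invocation, matching the *operator_benchmark* branch of the dispatch
# at the bottom of this file (TEST_MODE defaults to "short" there):
#   test_operator_benchmark cpu short
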
test_operator_microbenchmark() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"
  TEST_DIR=$(pwd)

  cd benchmarks/operator_benchmark/pt_extension
  python -m pip install .

  cd "${TEST_DIR}"/benchmarks/operator_benchmark

  for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do
    $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
      --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
      --benchmark-name "PyTorch operator microbenchmark" --use-compile
    $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
      --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}.json" \
      --benchmark-name "PyTorch operator microbenchmark"
  done
}

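# test_operator_microbenchmark takes no arguments; for each of matmul/mm/addmm/bmm
# it emits a compiled and an eager JSON report under test/test-reports, e.g.
# operator_microbenchmark_mm_compile.json and operator_microbenchmark_mm.json.
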
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
  (cd test && python -c "import torch; print(torch.__config__.show())")
  (cd test && python -c "import torch; print(torch.__config__.parallel_info())")
fi
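# Everything below dispatches on TEST_CONFIG / BUILD_ENVIRONMENT (and the shard
# variables) to pick which of the test_* routines above actually run. A
# hypothetical local invocation of a single config could look like:
#   TEST_CONFIG=docs_test SHARD_NUMBER=1 NUM_TEST_SHARDS=1 ./test.sh
# (the script name and config value here are illustrative, not prescriptive).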
if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then
  # Install numpy-2.0.2 and compatible scipy & numba versions
  # Force a re-install of pandas to avoid an error where pandas checks the numpy
  # version from the initial install and fails upon import
  TMP_PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)" 2>/dev/null)
  if [ -n "$TMP_PANDAS_VERSION" ]; then
    python -m pip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0 pandas=="$TMP_PANDAS_VERSION" --force-reinstall
  else
    python -m pip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0
  fi
  python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py
elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]]; then
  test_linux_aarch64
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
  test_forward_backward_compatibility
  # Do NOT add tests after the BC check tests; see the comment on
  # test_forward_backward_compatibility above.
elif [[ "${TEST_CONFIG}" == *xla* ]]; then
  install_torchvision
  build_xla
  test_xla
elif [[ "$TEST_CONFIG" == *vllm* ]]; then
  echo "vLLM CI uses TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST"
  (cd .ci/lumen_cli && python -m pip install -e .)
  python -m cli.run test external vllm --test-plan "$TEST_CONFIG" --shard-id "$SHARD_NUMBER" --num-shards "$NUM_TEST_SHARDS"
elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
  test_executorch
elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
  test_python_legacy_jit
elif [[ "$TEST_CONFIG" == 'quantization' ]]; then
  test_quantization
elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
  # TODO: run some C++ tests
  echo "no-op at the moment"
elif [[ "$TEST_CONFIG" == distributed ]]; then
  test_distributed
  # Only run RPC C++ tests on the first shard
  if [[ "${SHARD_NUMBER}" == 1 ]]; then
    test_rpc
  fi
elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
  TEST_MODE="short"

  if [[ "${TEST_CONFIG}" == *cpu* ]]; then
    if [[ "${TEST_CONFIG}" == *long* ]]; then
      TEST_MODE="long"
    elif [[ "${TEST_CONFIG}" == *all* ]]; then
      TEST_MODE="all"
    fi

    test_operator_benchmark cpu ${TEST_MODE}

  fi
elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then
  test_operator_microbenchmark
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
  test_inductor_halide
elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
  test_inductor_triton_cpu
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
  test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *aoti_cross_compile_for_windows* ]]; then
  test_inductor_aoti_cross_compile_for_windows
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
  install_torchvision
  id=$((SHARD_NUMBER-1))
  test_dynamo_benchmark huggingface "$id"
elif [[ "${TEST_CONFIG}" == *timm* ]]; then
  install_torchvision
  id=$((SHARD_NUMBER-1))
  test_dynamo_benchmark timm_models "$id"
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
  install_torchaudio
  install_torchvision
  PYTHONPATH=/torchbench test_cachebench
elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
  install_torchaudio
  install_torchvision
  PYTHONPATH=/torchbench test_verify_cachebench
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
  install_torchaudio
  install_torchvision
  id=$((SHARD_NUMBER-1))
  # https://github.com/opencv/opencv-python/issues/885
  pip_install opencv-python==4.8.0.74
  if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
    PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf
  elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
    PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf
  elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
    TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest
  else
    # Do this after checkout_install_torchbench to ensure we clobber any
    # nightlies that torchbench may pull in
    if [[ "${TEST_CONFIG}" != *cpu* ]]; then
      install_torchrec_and_fbgemm
    fi
    PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
  fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
  install_torchvision
  PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
  if [[ "$SHARD_NUMBER" -eq "1" ]]; then
    test_inductor_aoti
  fi
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
  install_torchvision
  test_inductor_shard "${SHARD_NUMBER}"
elif [[ "${TEST_CONFIG}" == *einops* ]]; then
  test_einops
elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
  install_torchvision
  test_dynamo_wrapped_shard "${SHARD_NUMBER}"
  if [[ "${SHARD_NUMBER}" == 1 ]]; then
    test_aten
  fi
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
  install_torchvision
  test_python_shard "$SHARD_NUMBER"
  test_aten
elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
  test_lazy_tensor_meta_reference_disabled
  test_without_numpy
  install_torchvision
  test_python_shard 1
  test_aten
  test_libtorch 1
  if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
    test_xpu_bin
  fi
elif [[ "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
  install_torchvision
  test_python_shard 2
  test_libtorch 2
  test_aot_compilation
  test_custom_script_ops
  test_custom_backend
  test_torch_function_benchmark
elif [[ "${SHARD_NUMBER}" -gt 2 ]]; then
  # Handle arbitrary number of shards
  install_torchvision
  test_python_shard "$SHARD_NUMBER"
elif [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then
  test_vulkan
elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
  test_bazel
elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
  test_libtorch
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
  test_docs_test
elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
  install_torchvision
  test_python
  test_aten
  test_xpu_bin
elif [[ "${TEST_CONFIG}" == smoke ]]; then
  test_python_smoke
elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then
  test_python_smoke_b200
elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
  test_h100_distributed
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
  test_h100_symm_mem
elif [[ "${TEST_CONFIG}" == "b200-symm-mem" ]]; then
  test_h100_symm_mem
elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then
  test_h100_cutlass_backend
else
  install_torchvision
  install_monkeytype
  test_python
  test_aten
  test_vec256
  test_libtorch
  test_aot_compilation
  test_custom_script_ops
  test_custom_backend
  test_torch_function_benchmark
  test_benchmarks
fi