This is the first part of the stack that does comm/compute reordering, and then uses the exposure analysis to do bucketing. Subsequent PRs will handle:
- use of exposure analysis to do bucketing
- making sure inductor respects comm/compute overlapping done at the fx level
- non-profiling mm estimation / rank broadcasting of profile results

Other misc:
- validate accuracy of NCCL estimations (use ruisi's profiling instead?)

For a llama 2D-parallelism test, on forward we overlap all but 2 of the potentially hidden collectives. For backward, we overlap 217/269 of the potentially hidden collectives. If you increase `compute_overlap_multipler` (a fudge factor for inaccurate comms estimation), that goes down to all but 16 of the potentially hidden collectives.

fwd example: https://gist.github.com/eellison/76209c49d8829c5f1e323d34a3f040c3
bwd example: https://gist.github.com/eellison/6cfc2285df53a94cfa4012f5fdae5c51

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163215
Approved by: https://github.com/IvanKobzarev
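As a rough illustration of the kind of overlap being measured here, the sketch below compiles a region where an all_reduce can be hidden behind independent matmuls. This is a minimal sketch, not the new fx-level pass from this PR: it flips the pre-existing scheduler-level knob torch._inductor.config.reorder_for_compute_comm_overlap, and the file name and launch command in the comments are assumptions.

    # Sketch: a compiled region where an all_reduce can be hidden behind
    # independent matmuls. Assumes torchrun with >= 2 GPUs, e.g.:
    #   torchrun --nproc_per_node=2 overlap_sketch.py
    import torch
    import torch.distributed as dist
    import torch.distributed._functional_collectives as funcol
    import torch._inductor.config as inductor_config

    # Pre-existing inductor knob: let the scheduler reorder compute and
    # collectives to minimize exposed communication time. (The pass list in
    # inductor_config.reorder_for_compute_comm_overlap_passes keeps its default.)
    inductor_config.reorder_for_compute_comm_overlap = True

    @torch.compile
    def step(x, w):
        # The all_reduce result is only needed at the end, so a good schedule
        # issues the collective early and sinks its wait past the matmuls.
        r = funcol.all_reduce(x, "sum", dist.group.WORLD)
        z = w @ w  # independent compute that can hide the collective
        z = z @ w
        return r + z

    if __name__ == "__main__":
        dist.init_process_group("nccl")
        torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
        x = torch.randn(1024, 1024, device="cuda")
        w = torch.randn(1024, 1024, device="cuda")
        print(step(x, w).sum().item())
        dist.destroy_process_group()

The new fx-level pass itself, and its exposure analysis, are exercised by distributed/test_aten_comm_compute_reordering in the script below.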
72 lines
3.8 KiB
Bash
Executable File
#!/bin/bash

# Required environment variable: $BUILD_ENVIRONMENT
# (This is set by default in the Docker images we build, so you don't
# need to set it yourself.)

# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"

echo "Testing pytorch"
# When adding more tests, please use HUD to see which shard is shorter
if [[ "${SHARD_NUMBER:-1}" == "1" ]]; then
  # FSDP tests
  for f in test/distributed/fsdp/*.py ; do time python test/run_test.py --verbose -i "${f#*/}" ; done
fi

if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then
  time python test/run_test.py --include test_cuda_multigpu test_cuda_primary_ctx --verbose

  # Disabling tests to see if they solve timeout issues; see https://github.com/pytorch/pytorch/issues/70015
  # python tools/download_mnist.py --quiet -d test/cpp/api/mnist
  # OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" build/bin/test_api
  time python test/run_test.py --verbose -i distributed/test_c10d_common
  time python test/run_test.py --verbose -i distributed/test_c10d_gloo
  time python test/run_test.py --verbose -i distributed/test_c10d_nccl
  time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
  time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
  time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering
  time python test/run_test.py --verbose -i distributed/test_aten_comm_compute_reordering
  time python test/run_test.py --verbose -i distributed/test_store
  time python test/run_test.py --verbose -i distributed/test_symmetric_memory
  time python test/run_test.py --verbose -i distributed/test_pg_wrapper
  time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent

  # ShardedTensor tests
  time python test/run_test.py --verbose -i distributed/checkpoint/test_checkpoint
  time python test/run_test.py --verbose -i distributed/checkpoint/test_file_system_checkpoint
  time python test/run_test.py --verbose -i distributed/_shard/sharding_spec/test_sharding_spec
  time python test/run_test.py --verbose -i distributed/_shard/sharding_plan/test_sharding_plan
  time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/test_sharded_tensor
  time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/test_sharded_tensor_reshard

  # functional collective tests
  time python test/run_test.py --verbose -i distributed/test_functional_api

  # DTensor tests
  time python test/run_test.py --verbose -i distributed/tensor/test_random_ops
  time python test/run_test.py --verbose -i distributed/tensor/test_dtensor_compile
  time python test/run_test.py --verbose -i distributed/tensor/test_utils.py

  # DeviceMesh test
  time python test/run_test.py --verbose -i distributed/test_device_mesh

  # DTensor/TP tests
  time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_examples
  time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state

  # FSDP2 tests
  time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh

  # ND composability tests
  time python test/run_test.py --verbose -i distributed/_composable/test_composability/test_2d_composability
  time python test/run_test.py --verbose -i distributed/_composable/test_composability/test_pp_composability

  # Other tests
  time python test/run_test.py --verbose -i test_cuda_primary_ctx
  time python test/run_test.py --verbose -i test_optim -- -k test_forloop_goes_right_direction_multigpu
  time python test/run_test.py --verbose -i test_optim -- -k test_mixed_device_dtype
  time python test/run_test.py --verbose -i test_foreach -- -k test_tensors_grouping
fi

assert_git_not_dirty