diff --git a/.github/workflows/xpu-compile.yml b/.github/workflows/xpu-compile.yml index c23920910..e095e089f 100644 --- a/.github/workflows/xpu-compile.yml +++ b/.github/workflows/xpu-compile.yml @@ -51,9 +51,15 @@ jobs: - name: Compile Status shell: bash run: | + echo "# torch.compile graph breaks" >> $GITHUB_STEP_SUMMARY export FI_HMEM=system ulimit -n 1048575 cd tests/torch_compile export ZE_AFFINITY_MASK=0,1 - deepspeed test_compile.py --deepspeed_config ds_config.json 2>&1 | tee log.txt - cat log.txt | grep "'graph_breaks'" | sed 's/,/ /g' | awk '{print $2}' >> $GITHUB_STEP_SUMMARY + echo "## ZeRO stage 3" >> $GITHUB_STEP_SUMMARY + deepspeed test_compile.py --deepspeed_config ds_config_z3.json 2>&1 | tee log_z3.txt + # for each line start with 'dynamo_output', extract the second field and following fields and append to GITHUB_STEP_SUMMARY using awk + cat log_z3.txt | awk '/^dynamo_output/ {$1=""; print $0}' >> $GITHUB_STEP_SUMMARY + echo "## ZeRO stage 2" >> $GITHUB_STEP_SUMMARY + deepspeed test_compile.py --deepspeed_config ds_config_z2.json 2>&1 | tee log_z2.txt + cat log_z2.txt | awk '/^dynamo_output/ {$1=""; print $0}' >> $GITHUB_STEP_SUMMARY diff --git a/tests/torch_compile/ds_config_z2.json b/tests/torch_compile/ds_config_z2.json new file mode 100644 index 000000000..30e1237c5 --- /dev/null +++ b/tests/torch_compile/ds_config_z2.json @@ -0,0 +1,40 @@ +{ + "train_batch_size": 8, + "steps_per_print": 2000, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [ + 0.8, + 0.999 + ], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 1000 + } + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "bf16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 15 + }, + "wall_clock_breakdown": false, + "zero_optimization": { + "stage": 2, + "overlap_comm": false, + "contiguous_gradients": false + } +} diff --git a/tests/torch_compile/ds_config.json b/tests/torch_compile/ds_config_z3.json similarity index 100% rename from tests/torch_compile/ds_config.json rename to tests/torch_compile/ds_config_z3.json diff --git a/tests/torch_compile/test_compile.py b/tests/torch_compile/test_compile.py index 529ca56ae..adbf6eaa9 100644 --- a/tests/torch_compile/test_compile.py +++ b/tests/torch_compile/test_compile.py @@ -14,22 +14,9 @@ from torch.utils.data import Dataset, DataLoader torch._dynamo.config.cache_size_limit = 100 -import collections - def get_dynamo_stats(): - # TODO: consider deepcopy'ing the entire counters struct and - # adding a helper to do subtraction on it - return collections.Counter({ - "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"], - "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"], - "graph_breaks": sum(torch._dynamo.utils.counters["graph_break"].values()), - # NB: The plus removes zero counts - "unique_graph_breaks": len(+torch._dynamo.utils.counters["graph_break"]), - "autograd_captures": torch._dynamo.utils.counters["compiled_autograd"]["captures"], - "autograd_compiles": torch._dynamo.utils.counters["compiled_autograd"]["compiles"], - "cudagraph_skips": torch._dynamo.utils.counters["inductor"]["cudagraph_skips"], - }) + return torch._dynamo.utils.counters["graph_break"] class RandomDataset(Dataset): @@ -70,7 +57,7 @@ parser = argparse.ArgumentParser() parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from distributed launcher') parser.add_argument('--deepspeed_config', type=str, - default='ds_config.json', + default='ds_config_z3.json', help='path to DeepSpeed configuration file') cmd_args = parser.parse_args() @@ -82,6 +69,11 @@ residual = torch.rand(256, 256, dtype=torch.float).to(get_accelerator().current_ start_stats = get_dynamo_stats() +if comm.get_rank() == 0: + #print(dynamo_stats['graph_breaks']) + for item in start_stats.items(): + print(item) + for step, batch in enumerate(rand_loader): if step % 10 == 0 and comm.get_rank() == 0: print(f'step={step}') @@ -93,7 +85,14 @@ for step, batch in enumerate(rand_loader): model_engine.step() dynamo_stats = get_dynamo_stats() -dynamo_stats.subtract(start_stats) if comm.get_rank() == 0: - print(dynamo_stats) + # print break down of graph break stats with markdown, print in table format, start with reason, then count + # print a tag 'dynamo_output' before each line to allow post processing + print("dynamo_output | Reason | Count |") + print("dynamo_output | ------ | ----- |") + for item in dynamo_stats.items(): + # replace '|' in item[0] with a literal '|' to avoid mess with table format + item = (item[0].replace('|', r'\|'), item[1]) + print(f"dynamo_output | {item[0]} | {item[1]} |") + print(f"dynamo_output | Total | {sum(dynamo_stats.values())} |")