Mirror of https://github.com/huggingface/accelerate.git (synced 2025-10-20 10:03:46 +08:00)
@@ -62,12 +62,12 @@ def train_baseline(opt_level="O2"):
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
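The hunk above is representative of the change applied throughout this diff: multi-line asserts that used to parenthesize the condition now keep the condition on the `assert` line and parenthesize only the failure message, with quote styles inside the f-strings normalized (double quotes outside, single-quoted keys inside). A minimal runnable sketch of the before/after pattern; the metric dictionaries here are stand-ins, not code from the repository:

```python
# Stand-in metric results, purely illustrative.
base_model_results = {"accuracy": 0.75, "f1": 0.70}
trained_model_results = {"accuracy": 0.85, "f1": 0.82}

# Old layout: parenthesized condition, message trailing after the closing paren.
assert (
    trained_model_results["accuracy"] > base_model_results["accuracy"]
), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'

# New layout: bare condition, parenthesized message.
assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
    f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
)
```

Both spellings are valid Python and raise the same AssertionError on failure; the rewrite is purely a layout change.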
@@ -95,12 +95,12 @@ def train_integration(opt_level="O2"):
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -109,15 +109,15 @@ if __name__ == "__main__":
     for opt_level in ["O1", "O2"]:
         baseline_not_trained, baseline_trained = train_baseline(opt_level)
         accelerator_not_trained, accelerator_trained = train_integration(opt_level)
-        assert (
-            baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-        ), f'Accuracy not the same for untrained baseline and accelerator using opt_level={opt_level}: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-        assert (
-            baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-        ), f'F1 not the same for untrained baseline and accelerator using opt_level={opt_level}: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-        assert (
-            baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-        ), f'Accuracy not the same for trained baseline and accelerator using opt_level={opt_level}: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-        assert (
-            baseline_trained["f1"] == accelerator_trained["f1"]
-        ), f'F1 not the same for trained baseline and accelerator using opt_level={opt_level}: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+        assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+            f"Accuracy not the same for untrained baseline and accelerator using opt_level={opt_level}: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+        )
+        assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+            f"F1 not the same for untrained baseline and accelerator using opt_level={opt_level}: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+        )
+        assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+            f"Accuracy not the same for trained baseline and accelerator using opt_level={opt_level}: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+        )
+        assert baseline_trained["f1"] == accelerator_trained["f1"], (
+            f"F1 not the same for trained baseline and accelerator using opt_level={opt_level}: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+        )
@@ -90,12 +90,12 @@ def train_baseline(zero_stage: int = 1, opt_level: str = "O1"):
     model.destroy()
     torch.cuda.empty_cache()
     AcceleratorState()._reset_state(True)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -129,12 +129,12 @@ def train_integration(zero_stage: int = 1, opt_level: str = "O1"):
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
     model.destroy()
     torch.cuda.empty_cache()
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     AcceleratorState()._reset_state(True)
     return base_model_results, trained_model_results
@@ -145,17 +145,17 @@ if __name__ == "__main__":
         for opt_level in ["O1", "O2", "O3"]:
             baseline_not_trained, baseline_trained = train_baseline(zero_stage, opt_level)
             accelerator_not_trained, accelerator_trained = train_integration(zero_stage, opt_level)
-            assert (
-                baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-            ), f'ZERO stage {zero_stage}, opt_level={opt_level}:\nAccuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-            assert (
-                baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-            ), f'ZERO stage {zero_stage}, opt_level={opt_level}:\nF1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-            assert (
-                baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-            ), f'ZERO stage {zero_stage}, opt_level={opt_level}:\nAccuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-            assert (
-                baseline_trained["f1"] == accelerator_trained["f1"]
-            ), f'ZERO stage {zero_stage}, opt_level={opt_level}:\nF1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+            assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+                f"ZERO stage {zero_stage}, opt_level={opt_level}:\nAccuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+            )
+            assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+                f"ZERO stage {zero_stage}, opt_level={opt_level}:\nF1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+            )
+            assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+                f"ZERO stage {zero_stage}, opt_level={opt_level}:\nAccuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+            )
+            assert baseline_trained["f1"] == accelerator_trained["f1"], (
+                f"ZERO stage {zero_stage}, opt_level={opt_level}:\nF1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+            )
 
     torch.distributed.destroy_process_group()
@@ -56,12 +56,12 @@ def train_baseline(opt_level="O2"):
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -89,12 +89,12 @@ def train_integration(opt_level="O2"):
 
    trained_model_results = evaluate_model(model, eval_dataloader, METRIC)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -104,15 +104,15 @@ if __name__ == "__main__":
         baseline_not_trained, baseline_trained = train_baseline(opt_level)
         accelerator_not_trained, accelerator_trained = train_integration(opt_level)
 
-        assert (
-            baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-        ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-        assert (
-            baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-        ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-        assert (
-            baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-        ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-        assert (
-            baseline_trained["f1"] == accelerator_trained["f1"]
-        ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+        assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+            f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+        )
+        assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+            f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+        )
+        assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+            f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+        )
+        assert baseline_trained["f1"] == accelerator_trained["f1"], (
+            f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+        )
@@ -96,12 +96,12 @@ def train_baseline():
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -128,12 +128,12 @@ def train_integration():
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -142,17 +142,17 @@ if __name__ == "__main__":
     baseline_not_trained, baseline_trained = train_baseline()
     accelerator_not_trained, accelerator_trained = train_integration()
 
-    assert (
-        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-    assert (
-        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-    assert (
-        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-    assert (
-        baseline_trained["f1"] == accelerator_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+    assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+    )
+    assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+    )
+    assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+    )
+    assert baseline_trained["f1"] == accelerator_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+    )
 
     torch.distributed.destroy_process_group()
@@ -126,12 +126,12 @@ def train_baseline(zero_stage: int = 1):
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
     model.destroy()
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     del config
     return base_model_results, trained_model_results, model_outputs, data
@@ -180,12 +180,12 @@ def train_integration(zero_stage: int = 1):
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
     model.destroy()
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     del config
     return base_model_results, trained_model_results, model_outputs, data
@@ -197,17 +197,17 @@ if __name__ == "__main__":
         accelerator_not_trained, accelerator_trained, accelerator_outputs, accelerator_data = train_integration(
             zero_stage
         )
-        assert (
-            baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-        ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-        assert (
-            baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-        ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-        assert (
-            baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-        ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-        assert (
-            baseline_trained["f1"] == accelerator_trained["f1"]
-        ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+        assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+            f"ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+        )
+        assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+            f"ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+        )
+        assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+            f"ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+        )
+        assert baseline_trained["f1"] == accelerator_trained["f1"], (
+            f"ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+        )
         AcceleratorState()._reset_state(True)
     torch.distributed.destroy_process_group()
@@ -106,12 +106,12 @@ def train_baseline():
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -143,12 +143,12 @@ def train_integration():
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -157,17 +157,17 @@ if __name__ == "__main__":
     baseline_not_trained, baseline_trained = train_baseline()
    accelerator_not_trained, accelerator_trained = train_integration()
 
-    assert (
-        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-    assert (
-        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-    assert (
-        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-    assert (
-        baseline_trained["f1"] == accelerator_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+    assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+    )
+    assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+    )
+    assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+    )
+    assert baseline_trained["f1"] == accelerator_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+    )
 
     torch.distributed.destroy_process_group()
@@ -87,12 +87,12 @@ def train_baseline():
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -117,12 +117,12 @@ def train_integration():
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -131,15 +131,15 @@ if __name__ == "__main__":
     baseline_not_trained, baseline_trained = train_baseline()
     AcceleratorState._reset_state(True)
     accelerator_not_trained, accelerator_trained = train_integration()
-    assert (
-        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-    assert (
-        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-    assert (
-        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-    assert (
-        baseline_trained["f1"] == accelerator_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+    assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+    )
+    assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+    )
+    assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+    )
+    assert baseline_trained["f1"] == accelerator_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+    )
@@ -79,12 +79,12 @@ def train_baseline():
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -114,12 +114,12 @@ def train_integration():
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -128,17 +128,17 @@ if __name__ == "__main__":
     baseline_not_trained, baseline_trained = train_baseline()
     accelerator_not_trained, accelerator_trained = train_integration()
 
-    assert (
-        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-    assert (
-        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-    assert (
-        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-    assert (
-        baseline_trained["f1"] == accelerator_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+    assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+    )
+    assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+    )
+    assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+    )
+    assert baseline_trained["f1"] == accelerator_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+    )
 
     torch.distributed.destroy_process_group()
@@ -113,12 +113,12 @@ def train_baseline(zero_stage: int = 1):
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
     model.destroy()
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results, model_outputs, data
 
@@ -159,12 +159,12 @@ def train_integration(zero_stage: int = 1):
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
     model.destroy()
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results, model_outputs, data
 
@@ -175,17 +175,17 @@ if __name__ == "__main__":
         accelerator_not_trained, accelerator_trained, accelerator_outputs, accelerator_data = train_integration(
             zero_stage
        )
-        assert (
-            baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-        ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-        assert (
-            baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-        ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-        assert (
-            baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-        ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-        assert (
-            baseline_trained["f1"] == accelerator_trained["f1"]
-        ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+        assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+            f"ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+        )
+        assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+            f"ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+        )
+        assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+            f"ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+        )
+        assert baseline_trained["f1"] == accelerator_trained["f1"], (
+            f"ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+        )
 
    torch.distributed.destroy_process_group()
@@ -91,12 +91,12 @@ def train_baseline():
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -131,12 +131,12 @@ def train_integration():
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -145,17 +145,17 @@ if __name__ == "__main__":
     baseline_not_trained, baseline_trained = train_baseline()
     accelerator_not_trained, accelerator_trained = train_integration()
 
-    assert (
-        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-    assert (
-        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-    assert (
-        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-    assert (
-        baseline_trained["f1"] == accelerator_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+    assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+    )
+    assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+    )
+    assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+    )
+    assert baseline_trained["f1"] == accelerator_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+    )
 
     torch.distributed.destroy_process_group()
@@ -70,12 +70,12 @@ def train_baseline():
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -104,12 +104,12 @@ def train_integration():
 
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC)
 
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
 
     return base_model_results, trained_model_results
 
@@ -118,15 +118,15 @@ if __name__ == "__main__":
     baseline_not_trained, baseline_trained = train_baseline()
     accelerator_not_trained, accelerator_trained = train_integration()
 
-    assert (
-        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-    assert (
-        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-    assert (
-        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-    assert (
-        baseline_trained["f1"] == accelerator_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+    assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+    )
+    assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+    )
+    assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+    )
+    assert baseline_trained["f1"] == accelerator_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+    )
@@ -59,7 +59,7 @@ def evaluate(args, config: dict, init_fn: Callable, run_name: str) -> torch.Tens
 Loss: {loss[-1].item()}
 Peak Allocated Memory: {float(memory_tracker.peak_allocated_memory):.2f} MB
 Peak Reserved Memory: {float(memory_tracker.peak_reserved_memory):.2f} MB
-{'-' * 34}"""
+{"-" * 34}"""
     accelerator.print(msg)
     return loss
 
@@ -611,7 +611,7 @@ def main():
 
             if isinstance(checkpointing_steps, int):
                 if completed_steps % checkpointing_steps == 0:
-                    output_dir = f"step_{completed_steps }"
+                    output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
                     accelerator.save_state(output_dir)
setup.py
@@ -19,7 +19,7 @@ extras = {}
 extras["quality"] = [
     "black ~= 23.1",  # hf-doc-builder has a hidden dependency on `black`
     "hf-doc-builder >= 0.3.0",
-    "ruff ~= 0.6.4",
+    "ruff ~= 0.11.2",
 ]
 extras["docs"] = []
 extras["test_prod"] = ["pytest>=7.2.0,<=8.0.0", "pytest-xdist", "pytest-subtests", "parameterized", "pytest-order"]
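The `ruff` pin above is the dependency change that appears to drive the rest of this diff: the newer formatter keeps the assert condition on the `assert` line and parenthesizes only the failure message. The two layouts are semantically identical, because the parentheses group just the message expression, which is still evaluated only when the condition fails. A small runnable sketch with made-up values:

```python
# Hypothetical metric values, only for illustration.
trained_model_results = {"accuracy": 0.72}
base_model_results = {"accuracy": 0.80}

try:
    # New layout: the parentheses wrap only the message expression, so this is
    # the same statement as the unparenthesized one-line form.
    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
        f"Accuracy should be higher for the trained model: "
        f"{trained_model_results['accuracy']} > {base_model_results['accuracy']}"
    )
except AssertionError as err:
    print(err)  # Accuracy should be higher for the trained model: 0.72 > 0.8
```

Note that only the message may be parenthesized this way; wrapping the whole `condition, message` pair in parentheses would instead build a two-element tuple, which is always truthy.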
@@ -445,9 +445,9 @@ class Accelerator:
         self.has_fp8_handler = False
         if kwargs_handlers is not None:
             for handler in kwargs_handlers:
-                assert isinstance(
-                    handler, KwargsHandler
-                ), f"Unsupported kwargs handler passed: {handler}, must be one that inherits `accelerate.utils.KwargsHandler`."
+                assert isinstance(handler, KwargsHandler), (
+                    f"Unsupported kwargs handler passed: {handler}, must be one that inherits `accelerate.utils.KwargsHandler`."
+                )
                 # Add the handler class to the set of found handlers
                 if handler.__class__ in found_handlers:
                     raise ValueError(f"You can only pass one {handler.__class__} in `kwargs_handlers`.")
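For background on the check being reformatted above: `kwargs_handlers` is how callers tune low-level defaults when constructing an `Accelerator`, and every entry must inherit from `accelerate.utils.KwargsHandler`, which is what the assert enforces before duplicate-handler detection. A minimal usage sketch, assuming a single process-group handler (the timeout value is arbitrary):

```python
from datetime import timedelta

from accelerate import Accelerator
from accelerate.utils import InitProcessGroupKwargs

# InitProcessGroupKwargs inherits from KwargsHandler, so it passes the
# isinstance check above; passing two handlers of the same class would hit
# the ValueError branch instead.
handler = InitProcessGroupKwargs(timeout=timedelta(seconds=1800))
accelerator = Accelerator(kwargs_handlers=[handler])
```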
@@ -228,9 +228,9 @@ def get_cluster_input():
     )
     if use_deepspeed:
         distributed_type = DistributedType.DEEPSPEED
-        assert (
-            is_deepspeed_available()
-        ), "DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source"
+        assert is_deepspeed_available(), (
+            "DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source"
+        )
 
     if distributed_type == DistributedType.DEEPSPEED:
         use_deepspeed_config = _ask_field(
@@ -184,12 +184,12 @@ def training_function(config, args):
         with open(os.path.join(args.output_dir, f"state_{starting_epoch - 1}.json")) as f:
             resumed_state = json.load(f)
         assert resumed_state["accuracy"] == accuracy, "Accuracy mismatch, loading from checkpoint failed"
-        assert (
-            resumed_state["lr"] == lr_scheduler.get_lr()[0]
-        ), "Scheduler learning rate mismatch, loading from checkpoint failed"
-        assert (
-            resumed_state["optimizer_lr"] == optimizer.param_groups[0]["lr"]
-        ), "Optimizer learning rate mismatch, loading from checkpoint failed"
+        assert resumed_state["lr"] == lr_scheduler.get_lr()[0], (
+            "Scheduler learning rate mismatch, loading from checkpoint failed"
+        )
+        assert resumed_state["optimizer_lr"] == optimizer.param_groups[0]["lr"], (
+            "Optimizer learning rate mismatch, loading from checkpoint failed"
+        )
         assert resumed_state["epoch"] == starting_epoch - 1, "Epoch mismatch, loading from checkpoint failed"
         return
 
@ -115,9 +115,9 @@ def test_torch_metrics(
 ):
 _, ddp_model, dataloader = get_basic_setup(accelerator, num_samples, batch_size)
 logits, _ = generate_predictions(ddp_model, dataloader, accelerator)
-assert (
-    len(logits) == num_samples
-), f"Unexpected number of inputs:\n Expected: {num_samples}\n Actual: {len(logits)}"
+assert len(logits) == num_samples, (
+    f"Unexpected number of inputs:\n Expected: {num_samples}\n Actual: {len(logits)}"
+)


 def test_mrpc(dispatch_batches: bool = False, split_batches: bool = False):
@ -148,9 +148,9 @@ def test_mrpc(dispatch_batches: bool = False, split_batches: bool = False):
 distributed = metric.compute()

 for key in "accuracy f1".split():
-    assert math.isclose(
-        baseline[key], distributed[key]
-    ), f"Baseline and Distributed are not the same for key {key}:\n\tBaseline: {baseline[key]}\n\tDistributed: {distributed[key]}\n"
+    assert math.isclose(baseline[key], distributed[key]), (
+        f"Baseline and Distributed are not the same for key {key}:\n\tBaseline: {baseline[key]}\n\tDistributed: {distributed[key]}\n"
+    )


 def test_gather_for_metrics_with_non_tensor_objects_iterable_dataset():
@ -235,9 +235,9 @@ def test_gather_for_metrics_drop_last():

 # Should return a full set of complete batches from each GPU
 num_expected_items = per_device_batch_size * accelerator.num_processes
-assert gathered_items.size(0) == (
-    num_expected_items
-), f"Expected number of items: {num_expected_items}, Actual: {gathered_items.size(0)}"
+assert gathered_items.size(0) == (num_expected_items), (
+    f"Expected number of items: {num_expected_items}, Actual: {gathered_items.size(0)}"
+)


 def main():
@ -255,9 +255,9 @@ def training_function(config, args):
 )
 train_total_peak_memory[f"epoch-{epoch}"] = tracemalloc.peaked + b2mb(tracemalloc.begin)
 if args.peak_memory_upper_bound is not None:
-    assert (
-        train_total_peak_memory[f"epoch-{epoch}"] <= args.peak_memory_upper_bound
-    ), "Peak memory usage exceeded the upper bound"
+    assert train_total_peak_memory[f"epoch-{epoch}"] <= args.peak_memory_upper_bound, (
+        "Peak memory usage exceeded the upper bound"
+    )

 accelerator.wait_for_everyone()
 if accelerator.is_main_process:
@ -161,9 +161,9 @@ def training_function(config, args):
 and linear_decay_scheduler
 and accelerator.state.mixed_precision == "no"
 ):
-    assert (
-        lr_scheduler.get_last_lr()[0] == expected_lr_after_first_optim_step
-    ), f"Wrong lr found at second step, expected {expected_lr_after_first_optim_step}, got {lr_scheduler.get_last_lr()[0]}"
+    assert lr_scheduler.get_last_lr()[0] == expected_lr_after_first_optim_step, (
+        f"Wrong lr found at second step, expected {expected_lr_after_first_optim_step}, got {lr_scheduler.get_last_lr()[0]}"
+    )
     lr_scheduler_check_completed = True

 model.eval()
@ -199,14 +199,14 @@ def training_function(config, args):

 # check that the LR is 0
 if linear_decay_scheduler and accelerator.state.mixed_precision == "no":
-    assert (
-        lr_scheduler.get_last_lr()[0] == 0
-    ), f"Wrong lr found at last step, expected 0, got {lr_scheduler.get_last_lr()[0]}"
+    assert lr_scheduler.get_last_lr()[0] == 0, (
+        f"Wrong lr found at last step, expected 0, got {lr_scheduler.get_last_lr()[0]}"
+    )

 if args.performance_lower_bound is not None:
-    assert (
-        args.performance_lower_bound <= best_performance
-    ), f"Best performance metric {best_performance} is lower than the lower bound {args.performance_lower_bound}"
+    assert args.performance_lower_bound <= best_performance, (
+        f"Best performance metric {best_performance} is lower than the lower bound {args.performance_lower_bound}"
+    )

 accelerator.wait_for_everyone()
 if accelerator.is_main_process:
@ -216,9 +216,9 @@ def training_function(config, args):
 # Finally try saving the model
 accelerator.save_model(model, args.output_dir)
 accelerator.wait_for_everyone()
-assert Path(
-    args.output_dir, SAFE_WEIGHTS_NAME
-).exists(), "Model was not saved when calling `Accelerator.save_model`"
+assert Path(args.output_dir, SAFE_WEIGHTS_NAME).exists(), (
+    "Model was not saved when calling `Accelerator.save_model`"
+)
 accelerator.end_training()

@ -270,9 +270,9 @@ def test_data_loader(data_loader, accelerator):
 sorted_all_examples = sorted(all_examples)

 # Check if all elements are present in the sorted list of iterated samples
-assert (
-    len(set(sorted_all_examples)) == NUM_ELEMENTS
-), "Not all the dataset elements have been iterated in an epoch due to duplication of samples across processes."
+assert len(set(sorted_all_examples)) == NUM_ELEMENTS, (
+    "Not all the dataset elements have been iterated in an epoch due to duplication of samples across processes."
+)


 def test_stateful_dataloader(accelerator):
@ -112,9 +112,9 @@ def process_execution_check():
 assert text.startswith("Currently in the main process\n"), "Main process was not first"
 if num_processes > 1:
     assert text.endswith("Now on another process\n"), "Main process was not first"
-    assert (
-        text.count("Now on another process\n") == accelerator.num_processes - 1
-    ), f"Only wrote to file {text.count('Now on another process') + 1} times, not {accelerator.num_processes}"
+    assert text.count("Now on another process\n") == accelerator.num_processes - 1, (
+        f"Only wrote to file {text.count('Now on another process') + 1} times, not {accelerator.num_processes}"
+    )
 except AssertionError:
     path.unlink()
     raise
@ -351,13 +351,13 @@ def custom_sampler_check():
 dl = prepare_data_loader(dl, state.device, state.num_processes, state.process_index)
 # We need just ensure that `dl.batch_sampler` (or `dl.batch_sampler.batch_sampler` is indeed the old batch sampler
 if hasattr(dl.batch_sampler, "batch_sampler"):
-    assert isinstance(
-        dl.batch_sampler.batch_sampler, CustomBatchSampler
-    ), "Custom sampler was changed after calling `prepare_data_loader`"
+    assert isinstance(dl.batch_sampler.batch_sampler, CustomBatchSampler), (
+        "Custom sampler was changed after calling `prepare_data_loader`"
+    )
 else:
-    assert isinstance(
-        dl.batch_sampler, CustomBatchSampler
-    ), "Custom sampler was changed after calling `prepare_data_loader`"
+    assert isinstance(dl.batch_sampler, CustomBatchSampler), (
+        "Custom sampler was changed after calling `prepare_data_loader`"
+    )


 def check_seedable_sampler():
@ -400,9 +400,9 @@ def check_seedable_sampler_in_batch_sampler_shard():
 )

 target_sampler = prepared_data_loader.batch_sampler.batch_sampler.sampler
-assert isinstance(
-    target_sampler, SeedableRandomSampler
-), "Sampler in BatchSamplerShard is not SeedableRandomSampler."
+assert isinstance(target_sampler, SeedableRandomSampler), (
+    "Sampler in BatchSamplerShard is not SeedableRandomSampler."
+)


 def check_seedable_sampler_with_data_seed():
@ -666,31 +666,31 @@ def test_split_between_processes_dataset(datasets_Dataset):
 state = AcceleratorState()
 data = datasets_Dataset.from_list([dict(k=v) for v in range(2 * state.num_processes)])
 with state.split_between_processes(data, apply_padding=False) as results:
-    assert (
-        len(results) == 2
-    ), f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
+    assert len(results) == 2, (
+        f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
+    )

 data = datasets_Dataset.from_list([dict(k=v) for v in range(2 * state.num_processes - 1)])
 with state.split_between_processes(data, apply_padding=False) as results:
     if state.is_last_process:
-        assert (
-            len(results) == 1
-        ), f"Last process did not receive a single item. Process index: {state.process_index}; Length: {len(results)}"
+        assert len(results) == 1, (
+            f"Last process did not receive a single item. Process index: {state.process_index}; Length: {len(results)}"
+        )
     else:
-        assert (
-            len(results) == 2
-        ), f"One of the intermediate processes did not receive two items. Process index: {state.process_index}; Length: {len(results)}"
+        assert len(results) == 2, (
+            f"One of the intermediate processes did not receive two items. Process index: {state.process_index}; Length: {len(results)}"
+        )

 data = datasets_Dataset.from_list([dict(k=v) for v in range(2 * state.num_processes - 1)])
 with state.split_between_processes(data, apply_padding=True) as results:
     if state.num_processes == 1:
-        assert (
-            len(results) == 1
-        ), f"Single process did not receive a single item. Process index: {state.process_index}; Length: {len(results)}"
+        assert len(results) == 1, (
+            f"Single process did not receive a single item. Process index: {state.process_index}; Length: {len(results)}"
+        )
     else:
-        assert (
-            len(results) == 2
-        ), f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
+        assert len(results) == 2, (
+            f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
+        )

 state.wait_for_everyone()

@ -699,18 +699,18 @@ def test_split_between_processes_list():
 state = AcceleratorState()
 data = list(range(0, 2 * state.num_processes))
 with state.split_between_processes(data) as results:
-    assert (
-        len(results) == 2
-    ), f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
+    assert len(results) == 2, (
+        f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
+    )

 data = list(range(0, (3 * state.num_processes) - 1))
 with state.split_between_processes(data, apply_padding=True) as results:
     if state.is_last_process:
         # Test that the last process gets the extra item(s)
         num_samples_per_device = math.ceil(len(data) / state.num_processes)
-        assert (
-            len(results) == num_samples_per_device
-        ), f"Last process did not get the extra item(s). Process index: {state.process_index}; Length: {len(results)}"
+        assert len(results) == num_samples_per_device, (
+            f"Last process did not get the extra item(s). Process index: {state.process_index}; Length: {len(results)}"
+        )
 state.wait_for_everyone()

@ -737,17 +737,17 @@ def test_split_between_processes_nested_dict():
 elif state.process_index == 3:
     assert results["b"] == data_copy["b"][-2:]
 if state.process_index == 0:
-    assert torch.allclose(
-        results["c"], data_copy["c"][: 8 // state.num_processes]
-    ), f"Did not obtain expected values on process 0, expected `{data['c'][: 8 // state.num_processes]}`, received: {results['c']}"
+    assert torch.allclose(results["c"], data_copy["c"][: 8 // state.num_processes]), (
+        f"Did not obtain expected values on process 0, expected `{data['c'][: 8 // state.num_processes]}`, received: {results['c']}"
+    )
 elif state.num_processes == 2:
-    assert torch.allclose(
-        results["c"], data_copy["c"][4:]
-    ), f"Did not obtain expected values on process 2, expected `{data['c'][4:]}`, received: {results['c']}"
+    assert torch.allclose(results["c"], data_copy["c"][4:]), (
+        f"Did not obtain expected values on process 2, expected `{data['c'][4:]}`, received: {results['c']}"
+    )
 elif state.process_index == 3:
-    assert torch.allclose(
-        results["c"], data_copy["c"][-2:]
-    ), f"Did not obtain expected values on process 4, expected `{data['c'][-2:]}`, received: {results['c']}"
+    assert torch.allclose(results["c"], data_copy["c"][-2:]), (
+        f"Did not obtain expected values on process 4, expected `{data['c'][-2:]}`, received: {results['c']}"
+    )

 state.wait_for_everyone()

@ -773,13 +773,13 @@ def test_split_between_processes_evenly():
 num_extras = len(data) % state.num_processes
 with state.split_between_processes(data) as results:
     if state.process_index < num_extras:
-        assert (
-            len(results) == num_samples_per_process + 1
-        ), f"Each Process should have even elements. Expected: {num_samples_per_process + 1}, Actual: {len(results)}"
+        assert len(results) == num_samples_per_process + 1, (
+            f"Each Process should have even elements. Expected: {num_samples_per_process + 1}, Actual: {len(results)}"
+        )
     else:
-        assert (
-            len(results) == num_samples_per_process
-        ), f"Each Process should have even elements. Expected: {num_samples_per_process}, Actual: {len(results)}"
+        assert len(results) == num_samples_per_process, (
+            f"Each Process should have even elements. Expected: {num_samples_per_process}, Actual: {len(results)}"
+        )
 state.wait_for_everyone()

@ -32,14 +32,14 @@ def check_model_parameters(model_a, model_b, did_step, iteration, **kwargs):
 continue
 if not did_step:
     # Grads should not be in sync
-    assert (
-        torch.allclose(param.grad, grad_param.grad, **kwargs) is False
-    ), f"Gradients in sync when they should not be at iteration {iteration}:\nmodel_a grad ({param.grad}) == model_b grad ({grad_param.grad})"
+    assert torch.allclose(param.grad, grad_param.grad, **kwargs) is False, (
+        f"Gradients in sync when they should not be at iteration {iteration}:\nmodel_a grad ({param.grad}) == model_b grad ({grad_param.grad})"
+    )
 else:
     # Grads should be in sync
-    assert (
-        torch.allclose(param.grad, grad_param.grad, **kwargs) is True
-    ), f"Gradients not in sync when they should be at iteration {iteration}:\nmodel_a grad ({param.grad}) != model_b grad ({grad_param.grad})"
+    assert torch.allclose(param.grad, grad_param.grad, **kwargs) is True, (
+        f"Gradients not in sync when they should be at iteration {iteration}:\nmodel_a grad ({param.grad}) != model_b grad ({grad_param.grad})"
+    )


 def step_model(model, input, target, accelerator, do_backward=True):
@ -101,9 +101,9 @@ def test_noop_sync(accelerator):
 for param, ddp_param in zip(model.parameters(), ddp_model.parameters()):
     if not param.requires_grad:
         continue
-    assert torch.allclose(
-        param.grad, ddp_param.grad
-    ), f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+    assert torch.allclose(param.grad, ddp_param.grad), (
+        f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+    )

 # Shuffle ddp_input on each iteration
 torch.manual_seed(1337 + iteration)
@ -136,14 +136,14 @@ def test_distributed_sync(accelerator):
 continue
 if iteration % 2 == 0:
     # Grads should not be in sync
-    assert (
-        torch.allclose(param.grad, ddp_param.grad) is False
-    ), f"Gradients in sync when they should not be:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
+    assert torch.allclose(param.grad, ddp_param.grad) is False, (
+        f"Gradients in sync when they should not be:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
+    )
 else:
     # Grads should be in sync
-    assert (
-        torch.allclose(param.grad, ddp_param.grad) is True
-    ), f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+    assert torch.allclose(param.grad, ddp_param.grad) is True, (
+        f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+    )

 # Shuffle ddp_input on each iteration
 torch.manual_seed(1337 + iteration)
@ -185,9 +185,9 @@ def test_distributed_sync_multiple_fwd(accelerator):
 if not param.requires_grad:
     continue
 # Grads should not be in sync
-assert (
-    torch.allclose(param.grad, ddp_param.grad) is False
-), f"Gradients in sync when they should not be:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
+assert torch.allclose(param.grad, ddp_param.grad) is False, (
+    f"Gradients in sync when they should not be:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
+)

 else:
     # Sync grads if last backward
@ -199,9 +199,9 @@ def test_distributed_sync_multiple_fwd(accelerator):
 if not param.requires_grad:
     continue
 # Grads should be in sync
-assert (
-    torch.allclose(param.grad, ddp_param.grad) is True
-), f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+assert torch.allclose(param.grad, ddp_param.grad) is True, (
+    f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+)


 def test_gradient_accumulation(split_batches=False, dispatch_batches=False, sync_each_batch=False):
@ -230,14 +230,14 @@ def test_gradient_accumulation(split_batches=False, dispatch_batches=False, sync
 continue
 if ((iteration + 1) % 2 == 0) or (iteration == len(dataloader) - 1) or sync_each_batch:
     # Grads should be in sync
-    assert (
-        torch.allclose(param.grad, ddp_param.grad) is True
-    ), f"Gradients not in sync when they should be at iteration {iteration}:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+    assert torch.allclose(param.grad, ddp_param.grad) is True, (
+        f"Gradients not in sync when they should be at iteration {iteration}:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+    )
 else:
     # Grads should not be in sync
-    assert (
-        torch.allclose(param.grad, ddp_param.grad) is False
-    ), f"Gradients in sync when they should not be at iteration {iteration}:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
+    assert torch.allclose(param.grad, ddp_param.grad) is False, (
+        f"Gradients in sync when they should not be at iteration {iteration}:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
+    )

 # Shuffle ddp_input on each iteration
 torch.manual_seed(1337 + iteration)
@ -281,9 +281,9 @@ def test_gradient_accumulation_with_opt_and_scheduler(
 ddp_sched.step()

 # Learning rates should be the same
-assert (
-    opt.param_groups[0]["lr"] == ddp_opt.param_groups[0]["lr"]
-), f"Learning rates found in each optimizer did not align\nopt: {opt.param_groups[0]['lr']}\nDDP opt: {ddp_opt.param_groups[0]['lr']}\n"
+assert opt.param_groups[0]["lr"] == ddp_opt.param_groups[0]["lr"], (
+    f"Learning rates found in each optimizer did not align\nopt: {opt.param_groups[0]['lr']}\nDDP opt: {ddp_opt.param_groups[0]['lr']}\n"
+)
 did_step = (((iteration + 1) % 2) == 0) or ((iteration + 1) == len(dataloader))
 if accelerator.num_processes > 1:
     check_model_parameters(
@ -177,9 +177,9 @@ class FSDPPluginIntegration(AccelerateTestCase):
 env["FSDP_BACKWARD_PREFETCH"] = prefetch_policy
 with patch_environment(**env), ctx as cm:
     fsdp_plugin = FullyShardedDataParallelPlugin()
-    assert (
-        fsdp_plugin.backward_prefetch == expected_value
-    ), f"Actual: {fsdp_plugin.backward_prefetch} != Expected: {expected_value}"
+    assert fsdp_plugin.backward_prefetch == expected_value, (
+        f"Actual: {fsdp_plugin.backward_prefetch} != Expected: {expected_value}"
+    )
     if cm:
         self.assertTrue(any(_warning_message_fsdp2 in out for out in cm.output))

@ -439,24 +439,24 @@ class AcceleratorTester(AccelerateTestCase):
 model, optimizer, scheduler, train_dl, valid_dl, dummy_obj = accelerator.prepare(
     model, optimizer, scheduler, train_dl, valid_dl, dummy_obj
 )
-assert (
-    getattr(dummy_obj, "_is_accelerate_prepared", False) is False
-), "Dummy object should have `_is_accelerate_prepared` set to `True`"
-assert (
-    getattr(model, "_is_accelerate_prepared", False) is True
-), "Model is missing `_is_accelerator_prepared` or is set to `False`"
-assert (
-    getattr(optimizer, "_is_accelerate_prepared", False) is True
-), "Optimizer is missing `_is_accelerator_prepared` or is set to `False`"
-assert (
-    getattr(scheduler, "_is_accelerate_prepared", False) is True
-), "Scheduler is missing `_is_accelerator_prepared` or is set to `False`"
-assert (
-    getattr(train_dl, "_is_accelerate_prepared", False) is True
-), "Train Dataloader is missing `_is_accelerator_prepared` or is set to `False`"
-assert (
-    getattr(valid_dl, "_is_accelerate_prepared", False) is True
-), "Valid Dataloader is missing `_is_accelerator_prepared` or is set to `False`"
+assert getattr(dummy_obj, "_is_accelerate_prepared", False) is False, (
+    "Dummy object should have `_is_accelerate_prepared` set to `True`"
+)
+assert getattr(model, "_is_accelerate_prepared", False) is True, (
+    "Model is missing `_is_accelerator_prepared` or is set to `False`"
+)
+assert getattr(optimizer, "_is_accelerate_prepared", False) is True, (
+    "Optimizer is missing `_is_accelerator_prepared` or is set to `False`"
+)
+assert getattr(scheduler, "_is_accelerate_prepared", False) is True, (
+    "Scheduler is missing `_is_accelerator_prepared` or is set to `False`"
+)
+assert getattr(train_dl, "_is_accelerate_prepared", False) is True, (
+    "Train Dataloader is missing `_is_accelerator_prepared` or is set to `False`"
+)
+assert getattr(valid_dl, "_is_accelerate_prepared", False) is True, (
+    "Valid Dataloader is missing `_is_accelerator_prepared` or is set to `False`"
+)

 @require_cuda_or_xpu
 @slow
@ -498,16 +498,16 @@ class ModelEstimatorTester(unittest.TestCase):
 total_training_size_estimate = total_size_estimate * 4

 assert precision_str == output[i][0], f"Output is missing precision `{precision_str}`"
-assert (
-    largest_layer_estimate == output[i][1]
-), f"Calculation for largest layer size in `{precision_str}` is incorrect."
+assert largest_layer_estimate == output[i][1], (
+    f"Calculation for largest layer size in `{precision_str}` is incorrect."
+)

-assert (
-    total_size_estimate == output[i][2]
-), f"Calculation for total size in `{precision_str}` is incorrect."
-assert total_training_size_estimate == max(
-    output[i][3].values()
-), f"Calculation for total training size in `{precision_str}` is incorrect."
+assert total_size_estimate == output[i][2], (
+    f"Calculation for total size in `{precision_str}` is incorrect."
+)
+assert total_training_size_estimate == max(output[i][3].values()), (
+    f"Calculation for total training size in `{precision_str}` is incorrect."
+)

 @require_transformers
 def test_transformers_model(self):
@ -515,12 +515,12 @@ class ModelEstimatorTester(unittest.TestCase):
 output = gather_data(args)
 # The largest layer and total size of the model in bytes
 largest_layer, total_size = 90669056, 433249280
-assert (
-    largest_layer == output[0][1]
-), f"Calculation for largest layer size in `fp32` is incorrect, expected {largest_layer} but received {output[0][1]}"
-assert (
-    total_size == output[0][2]
-), f"Calculation for total size in `fp32` is incorrect, expected {total_size} but received {output[0][2]}"
+assert largest_layer == output[0][1], (
+    f"Calculation for largest layer size in `fp32` is incorrect, expected {largest_layer} but received {output[0][1]}"
+)
+assert total_size == output[0][2], (
+    f"Calculation for total size in `fp32` is incorrect, expected {total_size} but received {output[0][2]}"
+)

 @require_transformers
 def test_no_split_modules(self):
@ -538,12 +538,12 @@ class ModelEstimatorTester(unittest.TestCase):
 output = gather_data(args)
 # The largest layer and total size of the model in bytes
 largest_layer, total_size = 9437184, 102441032
-assert (
-    largest_layer == output[0][1]
-), f"Calculation for largest layer size in `fp32` is incorrect, expected {largest_layer} but received {output[0][1]}"
-assert (
-    total_size == output[0][2]
-), f"Calculation for total size in `fp32` is incorrect, expected {total_size} but received {output[0][2]}"
+assert largest_layer == output[0][1], (
+    f"Calculation for largest layer size in `fp32` is incorrect, expected {largest_layer} but received {output[0][1]}"
+)
+assert total_size == output[0][2], (
+    f"Calculation for total size in `fp32` is incorrect, expected {total_size} but received {output[0][2]}"
+)


 class ToFSDP2Tester(unittest.TestCase):
@ -55,9 +55,9 @@ def can_convert_te_model():


 def maintain_proper_deepspeed_config(expected_version):
-    assert (
-        AcceleratorState().deepspeed_plugin.zero_stage == expected_version
-    ), f"Expected zero stage {expected_version} but got {AcceleratorState().deepspeed_plugin.zero_stage}"
+    assert AcceleratorState().deepspeed_plugin.zero_stage == expected_version, (
+        f"Expected zero stage {expected_version} but got {AcceleratorState().deepspeed_plugin.zero_stage}"
+    )


 def can_convert_ao_model():
@ -33,13 +33,13 @@ def one_cycle_test(num_processes=2, step_scheduler_with_optimizer=True, split_ba
 # Optimizer has stepped
 scheduler.step()
 if step_scheduler_with_optimizer or (num_processes == 1):
-    assert (
-        scheduler.scheduler.last_epoch == num_processes
-    ), f"Last Epoch ({scheduler.scheduler.last_epoch}) != Num Processes ({num_processes})"
+    assert scheduler.scheduler.last_epoch == num_processes, (
+        f"Last Epoch ({scheduler.scheduler.last_epoch}) != Num Processes ({num_processes})"
+    )
 else:
-    assert (
-        scheduler.scheduler.last_epoch != num_processes
-    ), f"Last Epoch ({scheduler.scheduler.last_epoch}) == Num Processes ({num_processes})"
+    assert scheduler.scheduler.last_epoch != num_processes, (
+        f"Last Epoch ({scheduler.scheduler.last_epoch}) == Num Processes ({num_processes})"
+    )


 def lambda_test(num_processes=2, step_scheduler_with_optimizer=True, split_batches=False):
@ -53,18 +53,18 @@ def lambda_test(num_processes=2, step_scheduler_with_optimizer=True, split_batch
 optimizer._is_overflow = False
 scheduler.step()
 expected_lr = 1 - (num_processes if (step_scheduler_with_optimizer and not split_batches) else 1) / 10
-assert (
-    scheduler.get_last_lr()[0] == expected_lr
-), f"Wrong lr found at first step, expected {expected_lr}, got {scheduler.get_last_lr()[0]}"
+assert scheduler.get_last_lr()[0] == expected_lr, (
+    f"Wrong lr found at first step, expected {expected_lr}, got {scheduler.get_last_lr()[0]}"
+)

 # Optimizer has not stepped
 optimizer._is_overflow = True
 scheduler.step()
 if not step_scheduler_with_optimizer:
     expected_lr = 1 - 2 / 10
-    assert (
-        scheduler.get_last_lr()[0] == expected_lr
-    ), f"Wrong lr found at second step, expected {expected_lr}, got {scheduler.get_last_lr()[0]}"
+    assert scheduler.get_last_lr()[0] == expected_lr, (
+        f"Wrong lr found at second step, expected {expected_lr}, got {scheduler.get_last_lr()[0]}"
+    )


 def accumulation_test(num_processes: int = 2):
@ -92,12 +92,12 @@ def accumulation_test(num_processes: int = 2):
 scheduler.step()

 if i == (10 * num_steps - 2):
-    assert (
-        scheduler.get_last_lr()[0] != 0
-    ), f"Wrong lr found at second-to-last step, expected non-zero, got {scheduler.get_last_lr()[0]}. num_steps: {num_steps}"
-assert (
-    scheduler.get_last_lr()[0] == 0
-), f"Wrong lr found at last step, expected 0, got {scheduler.get_last_lr()[0]}"
+    assert scheduler.get_last_lr()[0] != 0, (
+        f"Wrong lr found at second-to-last step, expected non-zero, got {scheduler.get_last_lr()[0]}. num_steps: {num_steps}"
+    )
+assert scheduler.get_last_lr()[0] == 0, (
+    f"Wrong lr found at last step, expected 0, got {scheduler.get_last_lr()[0]}"
+)
 GradientState._reset_state()

@ -421,9 +421,9 @@ if __name__ == "__main__":
 for group in optimizer.param_groups:
     param_device = group["params"][0].device
     break
-assert (
-    param_device.type == torch.device("cpu").type
-), f"Loaded optimizer states did not match, expected to be loaded on the CPU but got {param_device}"
+assert param_device.type == torch.device("cpu").type, (
+    f"Loaded optimizer states did not match, expected to be loaded on the CPU but got {param_device}"
+)

 # Check device state
 model.to(accelerator.device)
@ -431,9 +431,9 @@ if __name__ == "__main__":
 for group in optimizer.param_groups:
     param_device = group["params"][0].device
     break
-assert (
-    param_device.type == accelerator.device.type
-), f"Loaded optimizer states did not match, expected to be loaded on {accelerator.device} but got {param_device}"
+assert param_device.type == accelerator.device.type, (
+    f"Loaded optimizer states did not match, expected to be loaded on {accelerator.device} but got {param_device}"
+)

 # Check error
 with pytest.raises(TypeError, match="Unsupported optimizer map location passed"):
@ -40,9 +40,7 @@ def parse_args():
 """
 parser = ArgumentParser(
     description=(
-        "PyTorch TPU distributed training launch "
-        "helper utility that will spawn up "
-        "multiple distributed processes"
+        "PyTorch TPU distributed training launch helper utility that will spawn up multiple distributed processes"
     )
 )

@ -56,7 +56,7 @@ for log in Path().glob("*.log"):
 if line.get("nodeid", "") != "":
     test = line["nodeid"]
     if line.get("duration", None) is not None:
-        duration = f'{line["duration"]:.4f}'
+        duration = f"{line['duration']:.4f}"
     if line.get("outcome", "") == "failed":
         section_num_failed += 1
         failed.append([test, duration, log.name.split("_")[0]])
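The quote flip in this hunk and the one below is another formatting normalization: the f-string keeps double quotes on the outside and the subscript keys inside switch to single quotes, which is equivalent at runtime. A small illustration with a made-up dictionary (the real script iterates over per-test log entries with "nodeid", "duration", and "outcome" fields):

    # Illustrative entry only, not from the repository.
    line = {"duration": 0.123456}

    # Old style: single-quoted f-string so the double-quoted key needs no escaping.
    old_duration = f'{line["duration"]:.4f}'
    # New style: double-quoted f-string with the inner key quotes flipped to single quotes.
    new_duration = f"{line['duration']:.4f}"

    assert old_duration == new_duration == "0.1235"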
@ -136,7 +136,7 @@ if os.environ.get("TEST_TYPE", "") != "":
     "text": "Check Action results",
     "emoji": True,
 },
-"url": f'https://github.com/{os.environ["GITHUB_REPOSITORY"]}/actions/runs/{os.environ["GITHUB_RUN_ID"]}',
+"url": f"https://github.com/{os.environ['GITHUB_REPOSITORY']}/actions/runs/{os.environ['GITHUB_RUN_ID']}",
 },
 }
 payload.append(action_button)