Bump ruff to 0.11.2 (#3471)

* ruff format

* Bump ruff to 0.11.2
Author: cyyever
Date: 2025-04-01 17:57:06 +08:00
Committed by: GitHub
Parent commit: 67a768be07
This commit: 3169339f5b

31 changed files with 456 additions and 458 deletions
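The substance of the change is twofold: the `ruff` pin in setup.py moves from ~= 0.6.4 to ~= 0.11.2, and `ruff format` is re-run, which rewrites every multi-line assert so that the condition stays on the `assert` line and only the message is wrapped in parentheses (flipping the f-string quote style in the process). A minimal, self-contained sketch of the before/after layout (illustrative variable names, not taken from the changed files):

# Illustrative values only; the real scripts compare metric dictionaries.
baseline_accuracy, trained_accuracy = 0.80, 0.85

# Layout before this commit (older formatter wraps the condition):
assert (
    trained_accuracy > baseline_accuracy
), f"Accuracy should be higher for the trained model: {trained_accuracy} > {baseline_accuracy}"

# Layout after running `ruff format` with ruff ~= 0.11.2 (condition inline, message wrapped):
assert trained_accuracy > baseline_accuracy, (
    f"Accuracy should be higher for the trained model: {trained_accuracy} > {baseline_accuracy}"
)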

View File

@@ -62,12 +62,12 @@ def train_baseline(opt_level="O2"):
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -95,12 +95,12 @@ def train_integration(opt_level="O2"):
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -109,15 +109,15 @@ if __name__ == "__main__":
     for opt_level in ["O1", "O2"]:
         baseline_not_trained, baseline_trained = train_baseline(opt_level)
         accelerator_not_trained, accelerator_trained = train_integration(opt_level)
-        assert (
-            baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-        ), f'Accuracy not the same for untrained baseline and accelerator using opt_level={opt_level}: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-        assert (
-            baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-        ), f'F1 not the same for untrained baseline and accelerator using opt_level={opt_level}: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-        assert (
-            baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-        ), f'Accuracy not the same for trained baseline and accelerator using opt_level={opt_level}: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-        assert (
-            baseline_trained["f1"] == accelerator_trained["f1"]
-        ), f'F1 not the same for trained baseline and accelerator using opt_level={opt_level}: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+        assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+            f"Accuracy not the same for untrained baseline and accelerator using opt_level={opt_level}: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+        )
+        assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+            f"F1 not the same for untrained baseline and accelerator using opt_level={opt_level}: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+        )
+        assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+            f"Accuracy not the same for trained baseline and accelerator using opt_level={opt_level}: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+        )
+        assert baseline_trained["f1"] == accelerator_trained["f1"], (
+            f"F1 not the same for trained baseline and accelerator using opt_level={opt_level}: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+        )

View File

@@ -90,12 +90,12 @@ def train_baseline(zero_stage: int = 1, opt_level: str = "O1"):
     model.destroy()
     torch.cuda.empty_cache()
     AcceleratorState()._reset_state(True)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -129,12 +129,12 @@ def train_integration(zero_stage: int = 1, opt_level: str = "O1"):
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
     model.destroy()
     torch.cuda.empty_cache()
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     AcceleratorState()._reset_state(True)
     return base_model_results, trained_model_results

@@ -145,17 +145,17 @@ if __name__ == "__main__":
     for opt_level in ["O1", "O2", "O3"]:
         baseline_not_trained, baseline_trained = train_baseline(zero_stage, opt_level)
         accelerator_not_trained, accelerator_trained = train_integration(zero_stage, opt_level)
-        assert (
-            baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-        ), f'ZERO stage {zero_stage}, opt_level={opt_level}:\nAccuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-        assert (
-            baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-        ), f'ZERO stage {zero_stage}, opt_level={opt_level}:\nF1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-        assert (
-            baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-        ), f'ZERO stage {zero_stage}, opt_level={opt_level}:\nAccuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-        assert (
-            baseline_trained["f1"] == accelerator_trained["f1"]
-        ), f'ZERO stage {zero_stage}, opt_level={opt_level}:\nF1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+        assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+            f"ZERO stage {zero_stage}, opt_level={opt_level}:\nAccuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+        )
+        assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+            f"ZERO stage {zero_stage}, opt_level={opt_level}:\nF1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+        )
+        assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+            f"ZERO stage {zero_stage}, opt_level={opt_level}:\nAccuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+        )
+        assert baseline_trained["f1"] == accelerator_trained["f1"], (
+            f"ZERO stage {zero_stage}, opt_level={opt_level}:\nF1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+        )
     torch.distributed.destroy_process_group()

View File

@@ -56,12 +56,12 @@ def train_baseline(opt_level="O2"):
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -89,12 +89,12 @@ def train_integration(opt_level="O2"):
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -104,15 +104,15 @@ if __name__ == "__main__":
         baseline_not_trained, baseline_trained = train_baseline(opt_level)
         accelerator_not_trained, accelerator_trained = train_integration(opt_level)
-        assert (
-            baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-        ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-        assert (
-            baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-        ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-        assert (
-            baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-        ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-        assert (
-            baseline_trained["f1"] == accelerator_trained["f1"]
-        ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+        assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+            f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+        )
+        assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+            f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+        )
+        assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+            f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+        )
+        assert baseline_trained["f1"] == accelerator_trained["f1"], (
+            f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+        )

View File

@@ -96,12 +96,12 @@ def train_baseline():
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -128,12 +128,12 @@ def train_integration():
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -142,17 +142,17 @@ if __name__ == "__main__":
     baseline_not_trained, baseline_trained = train_baseline()
    accelerator_not_trained, accelerator_trained = train_integration()
-    assert (
-        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-    assert (
-        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-    assert (
-        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-    assert (
-        baseline_trained["f1"] == accelerator_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+    assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+    )
+    assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+    )
+    assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+    )
+    assert baseline_trained["f1"] == accelerator_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+    )
     torch.distributed.destroy_process_group()

View File

@@ -126,12 +126,12 @@ def train_baseline(zero_stage: int = 1):
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
     model.destroy()
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     del config
     return base_model_results, trained_model_results, model_outputs, data

@@ -180,12 +180,12 @@ def train_integration(zero_stage: int = 1):
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
     model.destroy()
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     del config
     return base_model_results, trained_model_results, model_outputs, data

@@ -197,17 +197,17 @@ if __name__ == "__main__":
         accelerator_not_trained, accelerator_trained, accelerator_outputs, accelerator_data = train_integration(
             zero_stage
         )
-        assert (
-            baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-        ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-        assert (
-            baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-        ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-        assert (
-            baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-        ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-        assert (
-            baseline_trained["f1"] == accelerator_trained["f1"]
-        ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+        assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+            f"ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+        )
+        assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+            f"ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+        )
+        assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+            f"ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+        )
+        assert baseline_trained["f1"] == accelerator_trained["f1"], (
+            f"ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+        )
         AcceleratorState()._reset_state(True)
     torch.distributed.destroy_process_group()

View File

@@ -106,12 +106,12 @@ def train_baseline():
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -143,12 +143,12 @@ def train_integration():
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -157,17 +157,17 @@ if __name__ == "__main__":
     baseline_not_trained, baseline_trained = train_baseline()
     accelerator_not_trained, accelerator_trained = train_integration()
-    assert (
-        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-    assert (
-        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-    assert (
-        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-    assert (
-        baseline_trained["f1"] == accelerator_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+    assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+    )
+    assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+    )
+    assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+    )
+    assert baseline_trained["f1"] == accelerator_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+    )
     torch.distributed.destroy_process_group()

View File

@@ -87,12 +87,12 @@ def train_baseline():
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -117,12 +117,12 @@ def train_integration():
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -131,15 +131,15 @@ if __name__ == "__main__":
     baseline_not_trained, baseline_trained = train_baseline()
     AcceleratorState._reset_state(True)
     accelerator_not_trained, accelerator_trained = train_integration()
-    assert (
-        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-    assert (
-        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-    assert (
-        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-    assert (
-        baseline_trained["f1"] == accelerator_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+    assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+    )
+    assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+    )
+    assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+    )
+    assert baseline_trained["f1"] == accelerator_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+    )

View File

@@ -79,12 +79,12 @@ def train_baseline():
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -114,12 +114,12 @@ def train_integration():
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -128,17 +128,17 @@ if __name__ == "__main__":
     baseline_not_trained, baseline_trained = train_baseline()
     accelerator_not_trained, accelerator_trained = train_integration()
-    assert (
-        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-    assert (
-        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-    assert (
-        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-    assert (
-        baseline_trained["f1"] == accelerator_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+    assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+    )
+    assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+    )
+    assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+    )
+    assert baseline_trained["f1"] == accelerator_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+    )
     torch.distributed.destroy_process_group()

View File

@@ -113,12 +113,12 @@ def train_baseline(zero_stage: int = 1):
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
     model.destroy()
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results, model_outputs, data

@@ -159,12 +159,12 @@ def train_integration(zero_stage: int = 1):
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
     model.destroy()
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results, model_outputs, data

@@ -175,17 +175,17 @@ if __name__ == "__main__":
         accelerator_not_trained, accelerator_trained, accelerator_outputs, accelerator_data = train_integration(
             zero_stage
         )
-        assert (
-            baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-        ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-        assert (
-            baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-        ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-        assert (
-            baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-        ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-        assert (
-            baseline_trained["f1"] == accelerator_trained["f1"]
-        ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+        assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+            f"ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+        )
+        assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+            f"ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+        )
+        assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+            f"ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+        )
+        assert baseline_trained["f1"] == accelerator_trained["f1"], (
+            f"ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+        )
     torch.distributed.destroy_process_group()

View File

@@ -91,12 +91,12 @@ def train_baseline():
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -131,12 +131,12 @@ def train_integration():
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -145,17 +145,17 @@ if __name__ == "__main__":
     baseline_not_trained, baseline_trained = train_baseline()
     accelerator_not_trained, accelerator_trained = train_integration()
-    assert (
-        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-    assert (
-        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-    assert (
-        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-    assert (
-        baseline_trained["f1"] == accelerator_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+    assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+    )
+    assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+    )
+    assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+    )
+    assert baseline_trained["f1"] == accelerator_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+    )
     torch.distributed.destroy_process_group()

View File

@@ -70,12 +70,12 @@ def train_baseline():
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -104,12 +104,12 @@ def train_integration():
     trained_model_results = evaluate_model(model, eval_dataloader, METRIC)
-    assert (
-        trained_model_results["accuracy"] > base_model_results["accuracy"]
-    ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
-    assert (
-        trained_model_results["f1"] > base_model_results["f1"]
-    ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+    assert trained_model_results["accuracy"] > base_model_results["accuracy"], (
+        f"Accuracy should be higher for the trained model: {trained_model_results['accuracy']} > {base_model_results['accuracy']}"
+    )
+    assert trained_model_results["f1"] > base_model_results["f1"], (
+        f"F1 score should be higher for the trained model: {trained_model_results['f1']} > {base_model_results['f1']}"
+    )
     return base_model_results, trained_model_results

@@ -118,15 +118,15 @@ if __name__ == "__main__":
     baseline_not_trained, baseline_trained = train_baseline()
     accelerator_not_trained, accelerator_trained = train_integration()
-    assert (
-        baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
-    assert (
-        baseline_not_trained["f1"] == accelerator_not_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
-    assert (
-        baseline_trained["accuracy"] == accelerator_trained["accuracy"]
-    ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
-    assert (
-        baseline_trained["f1"] == accelerator_trained["f1"]
-    ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+    assert baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_not_trained['accuracy']} == {accelerator_not_trained['accuracy']}"
+    )
+    assert baseline_not_trained["f1"] == accelerator_not_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_not_trained['f1']} == {accelerator_not_trained['f1']}"
+    )
+    assert baseline_trained["accuracy"] == accelerator_trained["accuracy"], (
+        f"Accuracy should be the same for the baseline and accelerator: {baseline_trained['accuracy']} == {accelerator_trained['accuracy']}"
+    )
+    assert baseline_trained["f1"] == accelerator_trained["f1"], (
+        f"F1 score should be the same for the baseline and accelerator: {baseline_trained['f1']} == {accelerator_trained['f1']}"
+    )

View File

@@ -59,7 +59,7 @@ def evaluate(args, config: dict, init_fn: Callable, run_name: str) -> torch.Tensor:
     Loss: {loss[-1].item()}
     Peak Allocated Memory: {float(memory_tracker.peak_allocated_memory):.2f} MB
     Peak Reserved Memory: {float(memory_tracker.peak_reserved_memory):.2f} MB
-    {'-' * 34}"""
+    {"-" * 34}"""
     accelerator.print(msg)
     return loss

View File

@@ -611,7 +611,7 @@ def main():
             if isinstance(checkpointing_steps, int):
                 if completed_steps % checkpointing_steps == 0:
-                    output_dir = f"step_{completed_steps }"
+                    output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
                     accelerator.save_state(output_dir)

View File

@@ -19,7 +19,7 @@ extras = {}
 extras["quality"] = [
     "black ~= 23.1",  # hf-doc-builder has a hidden dependency on `black`
     "hf-doc-builder >= 0.3.0",
-    "ruff ~= 0.6.4",
+    "ruff ~= 0.11.2",
 ]
 extras["docs"] = []
 extras["test_prod"] = ["pytest>=7.2.0,<=8.0.0", "pytest-xdist", "pytest-subtests", "parameterized", "pytest-order"]

View File

@@ -445,9 +445,9 @@ class Accelerator:
         self.has_fp8_handler = False
         if kwargs_handlers is not None:
             for handler in kwargs_handlers:
-                assert isinstance(
-                    handler, KwargsHandler
-                ), f"Unsupported kwargs handler passed: {handler}, must be one that inherits `accelerate.utils.KwargsHandler`."
+                assert isinstance(handler, KwargsHandler), (
+                    f"Unsupported kwargs handler passed: {handler}, must be one that inherits `accelerate.utils.KwargsHandler`."
+                )
                 # Add the handler class to the set of found handlers
                 if handler.__class__ in found_handlers:
                     raise ValueError(f"You can only pass one {handler.__class__} in `kwargs_handlers`.")

View File

@@ -228,9 +228,9 @@ def get_cluster_input():
     )
     if use_deepspeed:
         distributed_type = DistributedType.DEEPSPEED
-        assert (
-            is_deepspeed_available()
-        ), "DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source"
+        assert is_deepspeed_available(), (
+            "DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source"
+        )
     if distributed_type == DistributedType.DEEPSPEED:
         use_deepspeed_config = _ask_field(

View File

@@ -184,12 +184,12 @@ def training_function(config, args):
         with open(os.path.join(args.output_dir, f"state_{starting_epoch - 1}.json")) as f:
             resumed_state = json.load(f)
         assert resumed_state["accuracy"] == accuracy, "Accuracy mismatch, loading from checkpoint failed"
-        assert (
-            resumed_state["lr"] == lr_scheduler.get_lr()[0]
-        ), "Scheduler learning rate mismatch, loading from checkpoint failed"
-        assert (
-            resumed_state["optimizer_lr"] == optimizer.param_groups[0]["lr"]
-        ), "Optimizer learning rate mismatch, loading from checkpoint failed"
+        assert resumed_state["lr"] == lr_scheduler.get_lr()[0], (
+            "Scheduler learning rate mismatch, loading from checkpoint failed"
+        )
+        assert resumed_state["optimizer_lr"] == optimizer.param_groups[0]["lr"], (
+            "Optimizer learning rate mismatch, loading from checkpoint failed"
+        )
         assert resumed_state["epoch"] == starting_epoch - 1, "Epoch mismatch, loading from checkpoint failed"
         return

View File

@@ -115,9 +115,9 @@ def test_torch_metrics(
 ):
     _, ddp_model, dataloader = get_basic_setup(accelerator, num_samples, batch_size)
     logits, _ = generate_predictions(ddp_model, dataloader, accelerator)
-    assert (
-        len(logits) == num_samples
-    ), f"Unexpected number of inputs:\n Expected: {num_samples}\n Actual: {len(logits)}"
+    assert len(logits) == num_samples, (
+        f"Unexpected number of inputs:\n Expected: {num_samples}\n Actual: {len(logits)}"
+    )

 def test_mrpc(dispatch_batches: bool = False, split_batches: bool = False):

@@ -148,9 +148,9 @@ def test_mrpc(dispatch_batches: bool = False, split_batches: bool = False):
     distributed = metric.compute()
     for key in "accuracy f1".split():
-        assert math.isclose(
-            baseline[key], distributed[key]
-        ), f"Baseline and Distributed are not the same for key {key}:\n\tBaseline: {baseline[key]}\n\tDistributed: {distributed[key]}\n"
+        assert math.isclose(baseline[key], distributed[key]), (
+            f"Baseline and Distributed are not the same for key {key}:\n\tBaseline: {baseline[key]}\n\tDistributed: {distributed[key]}\n"
+        )

 def test_gather_for_metrics_with_non_tensor_objects_iterable_dataset():

@@ -235,9 +235,9 @@ def test_gather_for_metrics_drop_last():
     # Should return a full set of complete batches from each GPU
     num_expected_items = per_device_batch_size * accelerator.num_processes
-    assert gathered_items.size(0) == (
-        num_expected_items
-    ), f"Expected number of items: {num_expected_items}, Actual: {gathered_items.size(0)}"
+    assert gathered_items.size(0) == (num_expected_items), (
+        f"Expected number of items: {num_expected_items}, Actual: {gathered_items.size(0)}"
+    )

 def main():

View File

@@ -255,9 +255,9 @@ def training_function(config, args):
             )
         train_total_peak_memory[f"epoch-{epoch}"] = tracemalloc.peaked + b2mb(tracemalloc.begin)
         if args.peak_memory_upper_bound is not None:
-            assert (
-                train_total_peak_memory[f"epoch-{epoch}"] <= args.peak_memory_upper_bound
-            ), "Peak memory usage exceeded the upper bound"
+            assert train_total_peak_memory[f"epoch-{epoch}"] <= args.peak_memory_upper_bound, (
+                "Peak memory usage exceeded the upper bound"
+            )
     accelerator.wait_for_everyone()
     if accelerator.is_main_process:

View File

@@ -161,9 +161,9 @@ def training_function(config, args):
                 and linear_decay_scheduler
                 and accelerator.state.mixed_precision == "no"
             ):
-                assert (
-                    lr_scheduler.get_last_lr()[0] == expected_lr_after_first_optim_step
-                ), f"Wrong lr found at second step, expected {expected_lr_after_first_optim_step}, got {lr_scheduler.get_last_lr()[0]}"
+                assert lr_scheduler.get_last_lr()[0] == expected_lr_after_first_optim_step, (
+                    f"Wrong lr found at second step, expected {expected_lr_after_first_optim_step}, got {lr_scheduler.get_last_lr()[0]}"
+                )
                 lr_scheduler_check_completed = True
         model.eval()

@@ -199,14 +199,14 @@ def training_function(config, args):
     # check that the LR is 0
     if linear_decay_scheduler and accelerator.state.mixed_precision == "no":
-        assert (
-            lr_scheduler.get_last_lr()[0] == 0
-        ), f"Wrong lr found at last step, expected 0, got {lr_scheduler.get_last_lr()[0]}"
+        assert lr_scheduler.get_last_lr()[0] == 0, (
+            f"Wrong lr found at last step, expected 0, got {lr_scheduler.get_last_lr()[0]}"
+        )
     if args.performance_lower_bound is not None:
-        assert (
-            args.performance_lower_bound <= best_performance
-        ), f"Best performance metric {best_performance} is lower than the lower bound {args.performance_lower_bound}"
+        assert args.performance_lower_bound <= best_performance, (
+            f"Best performance metric {best_performance} is lower than the lower bound {args.performance_lower_bound}"
+        )
     accelerator.wait_for_everyone()
     if accelerator.is_main_process:

@@ -216,9 +216,9 @@ def training_function(config, args):
     # Finally try saving the model
     accelerator.save_model(model, args.output_dir)
     accelerator.wait_for_everyone()
-    assert Path(
-        args.output_dir, SAFE_WEIGHTS_NAME
-    ).exists(), "Model was not saved when calling `Accelerator.save_model`"
+    assert Path(args.output_dir, SAFE_WEIGHTS_NAME).exists(), (
+        "Model was not saved when calling `Accelerator.save_model`"
+    )
     accelerator.end_training()

View File

@@ -270,9 +270,9 @@ def test_data_loader(data_loader, accelerator):
     sorted_all_examples = sorted(all_examples)
     # Check if all elements are present in the sorted list of iterated samples
-    assert (
-        len(set(sorted_all_examples)) == NUM_ELEMENTS
-    ), "Not all the dataset elements have been iterated in an epoch due to duplication of samples across processes."
+    assert len(set(sorted_all_examples)) == NUM_ELEMENTS, (
+        "Not all the dataset elements have been iterated in an epoch due to duplication of samples across processes."
+    )
 def test_stateful_dataloader(accelerator):

View File

@@ -112,9 +112,9 @@ def process_execution_check():
         assert text.startswith("Currently in the main process\n"), "Main process was not first"
         if num_processes > 1:
             assert text.endswith("Now on another process\n"), "Main process was not first"
-            assert (
-                text.count("Now on another process\n") == accelerator.num_processes - 1
-            ), f"Only wrote to file {text.count('Now on another process') + 1} times, not {accelerator.num_processes}"
+            assert text.count("Now on another process\n") == accelerator.num_processes - 1, (
+                f"Only wrote to file {text.count('Now on another process') + 1} times, not {accelerator.num_processes}"
+            )
     except AssertionError:
         path.unlink()
         raise
@@ -351,13 +351,13 @@ def custom_sampler_check():
     dl = prepare_data_loader(dl, state.device, state.num_processes, state.process_index)
     # We need just ensure that `dl.batch_sampler` (or `dl.batch_sampler.batch_sampler` is indeed the old batch sampler
     if hasattr(dl.batch_sampler, "batch_sampler"):
-        assert isinstance(
-            dl.batch_sampler.batch_sampler, CustomBatchSampler
-        ), "Custom sampler was changed after calling `prepare_data_loader`"
+        assert isinstance(dl.batch_sampler.batch_sampler, CustomBatchSampler), (
+            "Custom sampler was changed after calling `prepare_data_loader`"
+        )
     else:
-        assert isinstance(
-            dl.batch_sampler, CustomBatchSampler
-        ), "Custom sampler was changed after calling `prepare_data_loader`"
+        assert isinstance(dl.batch_sampler, CustomBatchSampler), (
+            "Custom sampler was changed after calling `prepare_data_loader`"
+        )
 def check_seedable_sampler():
@@ -400,9 +400,9 @@ def check_seedable_sampler_in_batch_sampler_shard():
     )
     target_sampler = prepared_data_loader.batch_sampler.batch_sampler.sampler
-    assert isinstance(
-        target_sampler, SeedableRandomSampler
-    ), "Sampler in BatchSamplerShard is not SeedableRandomSampler."
+    assert isinstance(target_sampler, SeedableRandomSampler), (
+        "Sampler in BatchSamplerShard is not SeedableRandomSampler."
+    )
 def check_seedable_sampler_with_data_seed():
@@ -666,31 +666,31 @@ def test_split_between_processes_dataset(datasets_Dataset):
     state = AcceleratorState()
     data = datasets_Dataset.from_list([dict(k=v) for v in range(2 * state.num_processes)])
     with state.split_between_processes(data, apply_padding=False) as results:
-        assert (
-            len(results) == 2
-        ), f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
+        assert len(results) == 2, (
+            f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
+        )
     data = datasets_Dataset.from_list([dict(k=v) for v in range(2 * state.num_processes - 1)])
     with state.split_between_processes(data, apply_padding=False) as results:
         if state.is_last_process:
-            assert (
-                len(results) == 1
-            ), f"Last process did not receive a single item. Process index: {state.process_index}; Length: {len(results)}"
+            assert len(results) == 1, (
+                f"Last process did not receive a single item. Process index: {state.process_index}; Length: {len(results)}"
+            )
         else:
-            assert (
-                len(results) == 2
-            ), f"One of the intermediate processes did not receive two items. Process index: {state.process_index}; Length: {len(results)}"
+            assert len(results) == 2, (
+                f"One of the intermediate processes did not receive two items. Process index: {state.process_index}; Length: {len(results)}"
+            )
     data = datasets_Dataset.from_list([dict(k=v) for v in range(2 * state.num_processes - 1)])
     with state.split_between_processes(data, apply_padding=True) as results:
         if state.num_processes == 1:
-            assert (
-                len(results) == 1
-            ), f"Single process did not receive a single item. Process index: {state.process_index}; Length: {len(results)}"
+            assert len(results) == 1, (
+                f"Single process did not receive a single item. Process index: {state.process_index}; Length: {len(results)}"
+            )
         else:
-            assert (
-                len(results) == 2
-            ), f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
+            assert len(results) == 2, (
+                f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
+            )
     state.wait_for_everyone()
@@ -699,18 +699,18 @@ def test_split_between_processes_list():
     state = AcceleratorState()
     data = list(range(0, 2 * state.num_processes))
     with state.split_between_processes(data) as results:
-        assert (
-            len(results) == 2
-        ), f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
+        assert len(results) == 2, (
+            f"Each process did not have two items. Process index: {state.process_index}; Length: {len(results)}"
+        )
     data = list(range(0, (3 * state.num_processes) - 1))
     with state.split_between_processes(data, apply_padding=True) as results:
         if state.is_last_process:
             # Test that the last process gets the extra item(s)
             num_samples_per_device = math.ceil(len(data) / state.num_processes)
-            assert (
-                len(results) == num_samples_per_device
-            ), f"Last process did not get the extra item(s). Process index: {state.process_index}; Length: {len(results)}"
+            assert len(results) == num_samples_per_device, (
+                f"Last process did not get the extra item(s). Process index: {state.process_index}; Length: {len(results)}"
+            )
     state.wait_for_everyone()
@@ -737,17 +737,17 @@ def test_split_between_processes_nested_dict():
     elif state.process_index == 3:
         assert results["b"] == data_copy["b"][-2:]
     if state.process_index == 0:
-        assert torch.allclose(
-            results["c"], data_copy["c"][: 8 // state.num_processes]
-        ), f"Did not obtain expected values on process 0, expected `{data['c'][: 8 // state.num_processes]}`, received: {results['c']}"
+        assert torch.allclose(results["c"], data_copy["c"][: 8 // state.num_processes]), (
+            f"Did not obtain expected values on process 0, expected `{data['c'][: 8 // state.num_processes]}`, received: {results['c']}"
+        )
     elif state.num_processes == 2:
-        assert torch.allclose(
-            results["c"], data_copy["c"][4:]
-        ), f"Did not obtain expected values on process 2, expected `{data['c'][4:]}`, received: {results['c']}"
+        assert torch.allclose(results["c"], data_copy["c"][4:]), (
+            f"Did not obtain expected values on process 2, expected `{data['c'][4:]}`, received: {results['c']}"
+        )
     elif state.process_index == 3:
-        assert torch.allclose(
-            results["c"], data_copy["c"][-2:]
-        ), f"Did not obtain expected values on process 4, expected `{data['c'][-2:]}`, received: {results['c']}"
+        assert torch.allclose(results["c"], data_copy["c"][-2:]), (
+            f"Did not obtain expected values on process 4, expected `{data['c'][-2:]}`, received: {results['c']}"
+        )
     state.wait_for_everyone()
@@ -773,13 +773,13 @@ def test_split_between_processes_evenly():
     num_extras = len(data) % state.num_processes
     with state.split_between_processes(data) as results:
         if state.process_index < num_extras:
-            assert (
-                len(results) == num_samples_per_process + 1
-            ), f"Each Process should have even elements. Expected: {num_samples_per_process + 1}, Actual: {len(results)}"
+            assert len(results) == num_samples_per_process + 1, (
+                f"Each Process should have even elements. Expected: {num_samples_per_process + 1}, Actual: {len(results)}"
+            )
         else:
-            assert (
-                len(results) == num_samples_per_process
-            ), f"Each Process should have even elements. Expected: {num_samples_per_process}, Actual: {len(results)}"
+            assert len(results) == num_samples_per_process, (
+                f"Each Process should have even elements. Expected: {num_samples_per_process}, Actual: {len(results)}"
+            )
     state.wait_for_everyone()

View File

@@ -32,14 +32,14 @@ def check_model_parameters(model_a, model_b, did_step, iteration, **kwargs):
             continue
         if not did_step:
             # Grads should not be in sync
-            assert (
-                torch.allclose(param.grad, grad_param.grad, **kwargs) is False
-            ), f"Gradients in sync when they should not be at iteration {iteration}:\nmodel_a grad ({param.grad}) == model_b grad ({grad_param.grad})"
+            assert torch.allclose(param.grad, grad_param.grad, **kwargs) is False, (
+                f"Gradients in sync when they should not be at iteration {iteration}:\nmodel_a grad ({param.grad}) == model_b grad ({grad_param.grad})"
+            )
         else:
             # Grads should be in sync
-            assert (
-                torch.allclose(param.grad, grad_param.grad, **kwargs) is True
-            ), f"Gradients not in sync when they should be at iteration {iteration}:\nmodel_a grad ({param.grad}) != model_b grad ({grad_param.grad})"
+            assert torch.allclose(param.grad, grad_param.grad, **kwargs) is True, (
+                f"Gradients not in sync when they should be at iteration {iteration}:\nmodel_a grad ({param.grad}) != model_b grad ({grad_param.grad})"
+            )
 def step_model(model, input, target, accelerator, do_backward=True):
@@ -101,9 +101,9 @@ def test_noop_sync(accelerator):
         for param, ddp_param in zip(model.parameters(), ddp_model.parameters()):
             if not param.requires_grad:
                 continue
-            assert torch.allclose(
-                param.grad, ddp_param.grad
-            ), f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+            assert torch.allclose(param.grad, ddp_param.grad), (
+                f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+            )
         # Shuffle ddp_input on each iteration
         torch.manual_seed(1337 + iteration)
@@ -136,14 +136,14 @@ def test_distributed_sync(accelerator):
                 continue
             if iteration % 2 == 0:
                 # Grads should not be in sync
-                assert (
-                    torch.allclose(param.grad, ddp_param.grad) is False
-                ), f"Gradients in sync when they should not be:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
+                assert torch.allclose(param.grad, ddp_param.grad) is False, (
+                    f"Gradients in sync when they should not be:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
+                )
            else:
                 # Grads should be in sync
-                assert (
-                    torch.allclose(param.grad, ddp_param.grad) is True
-                ), f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+                assert torch.allclose(param.grad, ddp_param.grad) is True, (
+                    f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+                )
         # Shuffle ddp_input on each iteration
         torch.manual_seed(1337 + iteration)
@@ -185,9 +185,9 @@ def test_distributed_sync_multiple_fwd(accelerator):
             if not param.requires_grad:
                 continue
             # Grads should not be in sync
-            assert (
-                torch.allclose(param.grad, ddp_param.grad) is False
-            ), f"Gradients in sync when they should not be:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
+            assert torch.allclose(param.grad, ddp_param.grad) is False, (
+                f"Gradients in sync when they should not be:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
+            )
     else:
         # Sync grads if last backward
@@ -199,9 +199,9 @@ def test_distributed_sync_multiple_fwd(accelerator):
            if not param.requires_grad:
                continue
            # Grads should be in sync
-           assert (
-               torch.allclose(param.grad, ddp_param.grad) is True
-           ), f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+           assert torch.allclose(param.grad, ddp_param.grad) is True, (
+               f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+           )
 def test_gradient_accumulation(split_batches=False, dispatch_batches=False, sync_each_batch=False):
@@ -230,14 +230,14 @@ def test_gradient_accumulation(split_batches=False, dispatch_batches=False, sync
                 continue
             if ((iteration + 1) % 2 == 0) or (iteration == len(dataloader) - 1) or sync_each_batch:
                 # Grads should be in sync
-                assert (
-                    torch.allclose(param.grad, ddp_param.grad) is True
-                ), f"Gradients not in sync when they should be at iteration {iteration}:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+                assert torch.allclose(param.grad, ddp_param.grad) is True, (
+                    f"Gradients not in sync when they should be at iteration {iteration}:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
+                )
             else:
                 # Grads should not be in sync
-                assert (
-                    torch.allclose(param.grad, ddp_param.grad) is False
-                ), f"Gradients in sync when they should not be at iteration {iteration}:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
+                assert torch.allclose(param.grad, ddp_param.grad) is False, (
+                    f"Gradients in sync when they should not be at iteration {iteration}:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
+                )
         # Shuffle ddp_input on each iteration
         torch.manual_seed(1337 + iteration)
@@ -281,9 +281,9 @@ def test_gradient_accumulation_with_opt_and_scheduler(
         ddp_sched.step()
         # Learning rates should be the same
-        assert (
-            opt.param_groups[0]["lr"] == ddp_opt.param_groups[0]["lr"]
-        ), f"Learning rates found in each optimizer did not align\nopt: {opt.param_groups[0]['lr']}\nDDP opt: {ddp_opt.param_groups[0]['lr']}\n"
+        assert opt.param_groups[0]["lr"] == ddp_opt.param_groups[0]["lr"], (
+            f"Learning rates found in each optimizer did not align\nopt: {opt.param_groups[0]['lr']}\nDDP opt: {ddp_opt.param_groups[0]['lr']}\n"
+        )
         did_step = (((iteration + 1) % 2) == 0) or ((iteration + 1) == len(dataloader))
         if accelerator.num_processes > 1:
             check_model_parameters(

View File

@@ -177,9 +177,9 @@ class FSDPPluginIntegration(AccelerateTestCase):
            env["FSDP_BACKWARD_PREFETCH"] = prefetch_policy
            with patch_environment(**env), ctx as cm:
                fsdp_plugin = FullyShardedDataParallelPlugin()
-               assert (
-                   fsdp_plugin.backward_prefetch == expected_value
-               ), f"Actual: {fsdp_plugin.backward_prefetch} != Expected: {expected_value}"
+               assert fsdp_plugin.backward_prefetch == expected_value, (
+                   f"Actual: {fsdp_plugin.backward_prefetch} != Expected: {expected_value}"
+               )
                if cm:
                    self.assertTrue(any(_warning_message_fsdp2 in out for out in cm.output))

View File

@@ -439,24 +439,24 @@ class AcceleratorTester(AccelerateTestCase):
        model, optimizer, scheduler, train_dl, valid_dl, dummy_obj = accelerator.prepare(
            model, optimizer, scheduler, train_dl, valid_dl, dummy_obj
        )
-       assert (
-           getattr(dummy_obj, "_is_accelerate_prepared", False) is False
-       ), "Dummy object should have `_is_accelerate_prepared` set to `True`"
+       assert getattr(dummy_obj, "_is_accelerate_prepared", False) is False, (
+           "Dummy object should have `_is_accelerate_prepared` set to `True`"
+       )
-       assert (
-           getattr(model, "_is_accelerate_prepared", False) is True
-       ), "Model is missing `_is_accelerator_prepared` or is set to `False`"
+       assert getattr(model, "_is_accelerate_prepared", False) is True, (
+           "Model is missing `_is_accelerator_prepared` or is set to `False`"
+       )
-       assert (
-           getattr(optimizer, "_is_accelerate_prepared", False) is True
-       ), "Optimizer is missing `_is_accelerator_prepared` or is set to `False`"
+       assert getattr(optimizer, "_is_accelerate_prepared", False) is True, (
+           "Optimizer is missing `_is_accelerator_prepared` or is set to `False`"
+       )
-       assert (
-           getattr(scheduler, "_is_accelerate_prepared", False) is True
-       ), "Scheduler is missing `_is_accelerator_prepared` or is set to `False`"
+       assert getattr(scheduler, "_is_accelerate_prepared", False) is True, (
+           "Scheduler is missing `_is_accelerator_prepared` or is set to `False`"
+       )
-       assert (
-           getattr(train_dl, "_is_accelerate_prepared", False) is True
-       ), "Train Dataloader is missing `_is_accelerator_prepared` or is set to `False`"
+       assert getattr(train_dl, "_is_accelerate_prepared", False) is True, (
+           "Train Dataloader is missing `_is_accelerator_prepared` or is set to `False`"
+       )
-       assert (
-           getattr(valid_dl, "_is_accelerate_prepared", False) is True
-       ), "Valid Dataloader is missing `_is_accelerator_prepared` or is set to `False`"
+       assert getattr(valid_dl, "_is_accelerate_prepared", False) is True, (
+           "Valid Dataloader is missing `_is_accelerator_prepared` or is set to `False`"
+       )
    @require_cuda_or_xpu
    @slow

View File

@@ -498,16 +498,16 @@ class ModelEstimatorTester(unittest.TestCase):
            total_training_size_estimate = total_size_estimate * 4
            assert precision_str == output[i][0], f"Output is missing precision `{precision_str}`"
-           assert (
-               largest_layer_estimate == output[i][1]
-           ), f"Calculation for largest layer size in `{precision_str}` is incorrect."
+           assert largest_layer_estimate == output[i][1], (
+               f"Calculation for largest layer size in `{precision_str}` is incorrect."
+           )
-           assert (
-               total_size_estimate == output[i][2]
-           ), f"Calculation for total size in `{precision_str}` is incorrect."
+           assert total_size_estimate == output[i][2], (
+               f"Calculation for total size in `{precision_str}` is incorrect."
+           )
-           assert total_training_size_estimate == max(
-               output[i][3].values()
-           ), f"Calculation for total training size in `{precision_str}` is incorrect."
+           assert total_training_size_estimate == max(output[i][3].values()), (
+               f"Calculation for total training size in `{precision_str}` is incorrect."
+           )
    @require_transformers
    def test_transformers_model(self):
@@ -515,12 +515,12 @@ class ModelEstimatorTester(unittest.TestCase):
        output = gather_data(args)
        # The largest layer and total size of the model in bytes
        largest_layer, total_size = 90669056, 433249280
-       assert (
-           largest_layer == output[0][1]
-       ), f"Calculation for largest layer size in `fp32` is incorrect, expected {largest_layer} but received {output[0][1]}"
+       assert largest_layer == output[0][1], (
+           f"Calculation for largest layer size in `fp32` is incorrect, expected {largest_layer} but received {output[0][1]}"
+       )
-       assert (
-           total_size == output[0][2]
-       ), f"Calculation for total size in `fp32` is incorrect, expected {total_size} but received {output[0][2]}"
+       assert total_size == output[0][2], (
+           f"Calculation for total size in `fp32` is incorrect, expected {total_size} but received {output[0][2]}"
+       )
    @require_transformers
    def test_no_split_modules(self):
@@ -538,12 +538,12 @@ class ModelEstimatorTester(unittest.TestCase):
        output = gather_data(args)
        # The largest layer and total size of the model in bytes
        largest_layer, total_size = 9437184, 102441032
-       assert (
-           largest_layer == output[0][1]
-       ), f"Calculation for largest layer size in `fp32` is incorrect, expected {largest_layer} but received {output[0][1]}"
+       assert largest_layer == output[0][1], (
+           f"Calculation for largest layer size in `fp32` is incorrect, expected {largest_layer} but received {output[0][1]}"
+       )
-       assert (
-           total_size == output[0][2]
-       ), f"Calculation for total size in `fp32` is incorrect, expected {total_size} but received {output[0][2]}"
+       assert total_size == output[0][2], (
+           f"Calculation for total size in `fp32` is incorrect, expected {total_size} but received {output[0][2]}"
+       )
 class ToFSDP2Tester(unittest.TestCase):

View File

@@ -55,9 +55,9 @@ def can_convert_te_model():
 def maintain_proper_deepspeed_config(expected_version):
-    assert (
-        AcceleratorState().deepspeed_plugin.zero_stage == expected_version
-    ), f"Expected zero stage {expected_version} but got {AcceleratorState().deepspeed_plugin.zero_stage}"
+    assert AcceleratorState().deepspeed_plugin.zero_stage == expected_version, (
+        f"Expected zero stage {expected_version} but got {AcceleratorState().deepspeed_plugin.zero_stage}"
+    )
 def can_convert_ao_model():

View File

@@ -33,13 +33,13 @@ def one_cycle_test(num_processes=2, step_scheduler_with_optimizer=True, split_ba
     # Optimizer has stepped
     scheduler.step()
     if step_scheduler_with_optimizer or (num_processes == 1):
-        assert (
-            scheduler.scheduler.last_epoch == num_processes
-        ), f"Last Epoch ({scheduler.scheduler.last_epoch}) != Num Processes ({num_processes})"
+        assert scheduler.scheduler.last_epoch == num_processes, (
+            f"Last Epoch ({scheduler.scheduler.last_epoch}) != Num Processes ({num_processes})"
+        )
     else:
-        assert (
-            scheduler.scheduler.last_epoch != num_processes
-        ), f"Last Epoch ({scheduler.scheduler.last_epoch}) == Num Processes ({num_processes})"
+        assert scheduler.scheduler.last_epoch != num_processes, (
+            f"Last Epoch ({scheduler.scheduler.last_epoch}) == Num Processes ({num_processes})"
+        )
 def lambda_test(num_processes=2, step_scheduler_with_optimizer=True, split_batches=False):
@@ -53,18 +53,18 @@ def lambda_test(num_processes=2, step_scheduler_with_optimizer=True, split_batch
     optimizer._is_overflow = False
     scheduler.step()
     expected_lr = 1 - (num_processes if (step_scheduler_with_optimizer and not split_batches) else 1) / 10
-    assert (
-        scheduler.get_last_lr()[0] == expected_lr
-    ), f"Wrong lr found at first step, expected {expected_lr}, got {scheduler.get_last_lr()[0]}"
+    assert scheduler.get_last_lr()[0] == expected_lr, (
+        f"Wrong lr found at first step, expected {expected_lr}, got {scheduler.get_last_lr()[0]}"
+    )
     # Optimizer has not stepped
     optimizer._is_overflow = True
     scheduler.step()
     if not step_scheduler_with_optimizer:
         expected_lr = 1 - 2 / 10
-    assert (
-        scheduler.get_last_lr()[0] == expected_lr
-    ), f"Wrong lr found at second step, expected {expected_lr}, got {scheduler.get_last_lr()[0]}"
+    assert scheduler.get_last_lr()[0] == expected_lr, (
+        f"Wrong lr found at second step, expected {expected_lr}, got {scheduler.get_last_lr()[0]}"
+    )
 def accumulation_test(num_processes: int = 2):
@@ -92,12 +92,12 @@ def accumulation_test(num_processes: int = 2):
         scheduler.step()
         if i == (10 * num_steps - 2):
-            assert (
-                scheduler.get_last_lr()[0] != 0
-            ), f"Wrong lr found at second-to-last step, expected non-zero, got {scheduler.get_last_lr()[0]}. num_steps: {num_steps}"
+            assert scheduler.get_last_lr()[0] != 0, (
+                f"Wrong lr found at second-to-last step, expected non-zero, got {scheduler.get_last_lr()[0]}. num_steps: {num_steps}"
+            )
-    assert (
-        scheduler.get_last_lr()[0] == 0
-    ), f"Wrong lr found at last step, expected 0, got {scheduler.get_last_lr()[0]}"
+    assert scheduler.get_last_lr()[0] == 0, (
+        f"Wrong lr found at last step, expected 0, got {scheduler.get_last_lr()[0]}"
+    )
     GradientState._reset_state()

View File

@@ -421,9 +421,9 @@ if __name__ == "__main__":
     for group in optimizer.param_groups:
         param_device = group["params"][0].device
         break
-    assert (
-        param_device.type == torch.device("cpu").type
-    ), f"Loaded optimizer states did not match, expected to be loaded on the CPU but got {param_device}"
+    assert param_device.type == torch.device("cpu").type, (
+        f"Loaded optimizer states did not match, expected to be loaded on the CPU but got {param_device}"
+    )
     # Check device state
     model.to(accelerator.device)
@@ -431,9 +431,9 @@ if __name__ == "__main__":
     for group in optimizer.param_groups:
         param_device = group["params"][0].device
         break
-    assert (
-        param_device.type == accelerator.device.type
-    ), f"Loaded optimizer states did not match, expected to be loaded on {accelerator.device} but got {param_device}"
+    assert param_device.type == accelerator.device.type, (
+        f"Loaded optimizer states did not match, expected to be loaded on {accelerator.device} but got {param_device}"
+    )
     # Check error
     with pytest.raises(TypeError, match="Unsupported optimizer map location passed"):

View File

@@ -40,9 +40,7 @@ def parse_args():
     """
     parser = ArgumentParser(
         description=(
-            "PyTorch TPU distributed training launch "
-            "helper utility that will spawn up "
-            "multiple distributed processes"
+            "PyTorch TPU distributed training launch helper utility that will spawn up multiple distributed processes"
         )
     )

View File

@@ -56,7 +56,7 @@ for log in Path().glob("*.log"):
         if line.get("nodeid", "") != "":
             test = line["nodeid"]
             if line.get("duration", None) is not None:
-                duration = f'{line["duration"]:.4f}'
+                duration = f"{line['duration']:.4f}"
             if line.get("outcome", "") == "failed":
                 section_num_failed += 1
                 failed.append([test, duration, log.name.split("_")[0]])
@@ -136,7 +136,7 @@ if os.environ.get("TEST_TYPE", "") != "":
                     "text": "Check Action results",
                     "emoji": True,
                 },
-                "url": f'https://github.com/{os.environ["GITHUB_REPOSITORY"]}/actions/runs/{os.environ["GITHUB_RUN_ID"]}',
+                "url": f"https://github.com/{os.environ['GITHUB_REPOSITORY']}/actions/runs/{os.environ['GITHUB_RUN_ID']}",
            },
        }
        payload.append(action_button)