ENH: Method comparison improve logging (#2591)

- Print early how the experiment is categorized
- Add a last-resort save_dir so that results are not lost
- Catch errors in general, not only OOM (sketched below)
- Log the error message
- Wrap checkpoint saving in try ... except, just in case (otherwise,
  if it fails, no logs are written)
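A condensed sketch of the error handling pattern this commit moves to: every failure mode records an error_msg and falls through to result logging instead of aborting. The helper run_training_steps and the final print are placeholders for illustration only, not the script's actual functions; the real change is in the diff below.

    import time

    def run_training_steps() -> None:
        # stand-in for the real training loop (hypothetical, for illustration only)
        pass

    status = "failed"
    error_msg = ""
    tic = time.perf_counter()
    try:
        run_training_steps()
        status = "success"
    except KeyboardInterrupt:
        status = "canceled"
        error_msg = "manually canceled"
    except Exception as exc:  # OOM or any other error: record it and keep going
        status = "canceled"
        error_msg = str(exc)
    train_time = time.perf_counter() - tic
    # results (including error_msg) are reported in every case instead of being lost
    print({"status": status, "error_msg": error_msg, "train_time": round(train_time, 3)})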
Author: Benjamin Bossan
Date: 2025-06-17 12:14:56 +02:00
Committed by: GitHub
Parent: a27406c26d
Commit: d6dbbc9195
2 changed files with 50 additions and 28 deletions

File 1 of 2:

@@ -48,6 +48,7 @@ from utils import (
     get_dataset_info,
     get_model,
     get_optimizer_and_scheduler,
+    get_peft_branch,
     get_tokenizer,
     get_train_config,
     init_cuda,
@@ -163,6 +164,7 @@ def train(
     status = TrainStatus.FAILED
     tic_train = time.perf_counter()
     eval_time = 0.0
+    error_msg = ""
 
     ds_train, ds_valid, ds_test = get_train_valid_test_datasets(
         tokenizer=tokenizer, query_template=query_template, print_fn=print_verbose
@@ -318,10 +320,16 @@ def train(
     except KeyboardInterrupt:
         print_verbose("canceled training")
         status = TrainStatus.CANCELED
-    except torch.OutOfMemoryError:
+        error_msg = "manually canceled"
+    except torch.OutOfMemoryError as exc:
         # ouch, still let's try to log some results
         print_verbose("out of memory error encountered")
         status = TrainStatus.CANCELED
+        error_msg = str(exc)
+    except Exception as exc:
+        print_verbose(f"encountered an error: {exc}")
+        status = TrainStatus.CANCELED
+        error_msg = str(exc)
 
     toc_train = time.perf_counter()
     train_time = toc_train - tic_train - eval_time
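Note on the except ordering above: torch.OutOfMemoryError is, to my knowledge, a RuntimeError subclass and therefore also matched by "except Exception", so the specific OOM handler has to stay listed before the new generic handler, which the diff preserves. A toy illustration of that ordering rule, unrelated to the project's code:

    class Specific(Exception):
        pass

    try:
        raise Specific("boom")
    except Specific as exc:  # reached because it is listed before the generic handler
        print(f"specific handler: {exc}")
    except Exception as exc:  # would also swallow Specific if it came first
        print(f"generic handler: {exc}")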
@@ -334,6 +342,7 @@
         cuda_memory_reserved_log=cuda_memory_reserved_log,
         losses=losses,
         metrics=metrics,
+        error_msg=error_msg,
     )
 
     return train_result
@@ -342,6 +351,14 @@ def main(*, path_experiment: str, experiment_name: str, clean: bool) -> None:
     tic_total = time.perf_counter()
     start_date = dt.datetime.now(tz=dt.timezone.utc).replace(microsecond=0).isoformat()
 
+    peft_branch = get_peft_branch()
+    if peft_branch == "main":
+        print_verbose("===== This experiment is categorized as a MAIN run because the PEFT branch is 'main' ======")
+    else:
+        print_verbose(
+            f"===== This experiment is categorized as a TEST run because the PEFT branch is '{peft_branch}' ======"
+        )
+
     # load configs
     peft_config = PeftConfig.from_pretrained(path_experiment)
     path_train_config = os.path.join(path_experiment, FILE_NAME_TRAIN_PARAMS)
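get_peft_branch() itself is not part of this diff; it is imported from the benchmark's utils module. A plausible stand-in (an assumption about how such a helper could look, not the actual implementation) would ask git for the currently checked-out branch of the PEFT repository:

    import subprocess

    def get_peft_branch_sketch(repo_path: str = ".") -> str:
        # Hypothetical equivalent of utils.get_peft_branch: return the branch name
        # of the git checkout at repo_path (e.g. "main" for a MAIN run).
        output = subprocess.check_output(
            ["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=repo_path, text=True
        )
        return output.strip()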
@@ -366,39 +383,38 @@ def main(*, path_experiment: str, experiment_name: str, clean: bool) -> None:
     print_verbose(model)
     # train model
-    try:
-        train_result = train(
-            model=model,
-            max_steps=train_config.max_steps,
-            batch_size=train_config.batch_size,
-            batch_size_eval=train_config.batch_size_eval,
-            tokenizer=tokenizer,
-            cuda_memory_init=cuda_memory_init,
-            eval_steps=train_config.eval_steps,
-            generation_kwargs=train_config.generation_kwargs,
-            grad_norm_clip=train_config.grad_norm_clip,
-            optimizer_type=train_config.optimizer_type,
-            optimizer_kwargs=train_config.optimizer_kwargs,
-            query_template=train_config.query_template,
-            lr_scheduler_arg=train_config.lr_scheduler,
-            use_amp=train_config.use_amp,
-            is_adalora=isinstance(peft_config, AdaLoraConfig),
-        )
-    except Exception as e:
-        print_verbose(f"Training failed with error: {e}")
-        raise
+    train_result = train(
+        model=model,
+        max_steps=train_config.max_steps,
+        batch_size=train_config.batch_size,
+        batch_size_eval=train_config.batch_size_eval,
+        tokenizer=tokenizer,
+        cuda_memory_init=cuda_memory_init,
+        eval_steps=train_config.eval_steps,
+        generation_kwargs=train_config.generation_kwargs,
+        grad_norm_clip=train_config.grad_norm_clip,
+        optimizer_type=train_config.optimizer_type,
+        optimizer_kwargs=train_config.optimizer_kwargs,
+        query_template=train_config.query_template,
+        lr_scheduler_arg=train_config.lr_scheduler,
+        use_amp=train_config.use_amp,
+        is_adalora=isinstance(peft_config, AdaLoraConfig),
+    )
 
     if train_result.status == TrainStatus.FAILED:
         print_verbose("Training failed, not logging results")
         sys.exit(1)
 
     # save the model in temp dir, get file size, clean it up afterwards if clean is passed
-    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True, delete=clean) as tmp_dir:
-        model.save_pretrained(tmp_dir)
-        stat = os.stat(os.path.join(tmp_dir, SAFETENSORS_WEIGHTS_NAME))
-        file_size = stat.st_size
-        if not clean:
-            print_verbose(f"Saved PEFT checkpoint to {tmp_dir}")
+    try:
+        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True, delete=clean) as tmp_dir:
+            model.save_pretrained(tmp_dir)
+            stat = os.stat(os.path.join(tmp_dir, SAFETENSORS_WEIGHTS_NAME))
+            file_size = stat.st_size
+            if not clean:
+                print_verbose(f"Saved PEFT checkpoint to {tmp_dir}")
+    except Exception as exc:
+        print(f"Failed to save PEFT checkpoint due to the following error: {exc}")
 
     time_total = time.perf_counter() - tic_total
 
     # log results: print and save to file

File 2 of 2:

@@ -21,6 +21,7 @@ import json
 import os
 import platform
 import subprocess
+import tempfile
 import warnings
 from dataclasses import asdict, dataclass
 from decimal import Decimal, DivisionByZero, InvalidOperation
@@ -545,6 +546,7 @@ class TrainResult:
     cuda_memory_reserved_log: list[int]
     losses: list[float]
     metrics: list[Any]  # TODO
+    error_msg: str
 
 
 def log_to_console(log_data: dict[str, Any], print_fn: Callable[..., None]) -> None:
@@ -621,6 +623,9 @@
     elif train_result.status == TrainStatus.SUCCESS:
         save_dir = RESULT_PATH
         print_fn("Experiment run was categorized as successful run")
+    else:
+        save_dir = tempfile.mkdtemp()
+        print_fn(f"Experiment could not be categorized, writing results to {save_dir}. Please open an issue on PEFT.")
 
     peft_config_dict = peft_config.to_dict()
     for key, value in peft_config_dict.items():
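Design note on the last-resort save_dir added above: tempfile.mkdtemp() creates a fresh directory under the system temp location and, unlike TemporaryDirectory, never removes it automatically, so results from an uncategorizable run stay on disk until the user deletes them. A minimal standalone sketch of that behavior (not the project's code; the file name and payload are illustrative):

    import json
    import os
    import tempfile

    fallback_dir = tempfile.mkdtemp()  # persists until deleted manually
    path = os.path.join(fallback_dir, "results.json")
    with open(path, "w") as f:
        json.dump({"error_msg": "example payload"}, f)  # illustrative content only
    print(f"results written to {path}")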
@@ -635,6 +640,7 @@
             "peft_branch": peft_branch,
             "train_config": asdict(train_config),
             "peft_config": peft_config_dict,
+            "error_msg": train_result.error_msg,
         },
         "train_info": {
             "cuda_memory_reserved_avg": cuda_memory_avg,