mirror of
https://github.com/huggingface/peft.git
synced 2025-10-20 15:33:48 +08:00
ENH: Method comparison improve logging (#2591)
- Print early how the experiment is categorized
- Add a last-resort save_dir so that results are not lost
- Catch errors in general, not only OOM
- Log the error message
- Wrap checkpoint saving in try ... except, just in case (otherwise, if it fails, no logs are written)
This commit is contained in:
@ -48,6 +48,7 @@ from utils import (
|
||||
get_dataset_info,
|
||||
get_model,
|
||||
get_optimizer_and_scheduler,
|
||||
get_peft_branch,
|
||||
get_tokenizer,
|
||||
get_train_config,
|
||||
init_cuda,
|
||||
@ -163,6 +164,7 @@ def train(
|
||||
status = TrainStatus.FAILED
|
||||
tic_train = time.perf_counter()
|
||||
eval_time = 0.0
|
||||
error_msg = ""
|
||||
|
||||
ds_train, ds_valid, ds_test = get_train_valid_test_datasets(
|
||||
tokenizer=tokenizer, query_template=query_template, print_fn=print_verbose
|
||||
@ -318,10 +320,16 @@ def train(
|
||||
except KeyboardInterrupt:
|
||||
print_verbose("canceled training")
|
||||
status = TrainStatus.CANCELED
|
||||
except torch.OutOfMemoryError:
|
||||
error_msg = "manually canceled"
|
||||
except torch.OutOfMemoryError as exc:
|
||||
# ouch, still let's try to log some results
|
||||
print_verbose("out of memory error encountered")
|
||||
status = TrainStatus.CANCELED
|
||||
error_msg = str(exc)
|
||||
except Exception as exc:
|
||||
print_verbose(f"encountered an error: {exc}")
|
||||
status = TrainStatus.CANCELED
|
||||
error_msg = str(exc)
|
||||
|
||||
toc_train = time.perf_counter()
|
||||
train_time = toc_train - tic_train - eval_time
|
||||
@ -334,6 +342,7 @@ def train(
|
||||
cuda_memory_reserved_log=cuda_memory_reserved_log,
|
||||
losses=losses,
|
||||
metrics=metrics,
|
||||
error_msg=error_msg,
|
||||
)
|
||||
return train_result
|
||||
|
||||
@ -342,6 +351,14 @@ def main(*, path_experiment: str, experiment_name: str, clean: bool) -> None:
|
||||
tic_total = time.perf_counter()
|
||||
start_date = dt.datetime.now(tz=dt.timezone.utc).replace(microsecond=0).isoformat()
|
||||
|
||||
peft_branch = get_peft_branch()
|
||||
if peft_branch == "main":
|
||||
print_verbose("===== This experiment is categorized as a MAIN run because the PEFT branch is 'main' ======")
|
||||
else:
|
||||
print_verbose(
|
||||
f"===== This experiment is categorized as a TEST run because the PEFT branch is '{peft_branch}' ======"
|
||||
)
|
||||
|
||||
# load configs
|
||||
peft_config = PeftConfig.from_pretrained(path_experiment)
|
||||
path_train_config = os.path.join(path_experiment, FILE_NAME_TRAIN_PARAMS)
|
||||
@ -366,39 +383,38 @@ def main(*, path_experiment: str, experiment_name: str, clean: bool) -> None:
|
||||
print_verbose(model)
|
||||
|
||||
# train model
|
||||
try:
|
||||
train_result = train(
|
||||
model=model,
|
||||
max_steps=train_config.max_steps,
|
||||
batch_size=train_config.batch_size,
|
||||
batch_size_eval=train_config.batch_size_eval,
|
||||
tokenizer=tokenizer,
|
||||
cuda_memory_init=cuda_memory_init,
|
||||
eval_steps=train_config.eval_steps,
|
||||
generation_kwargs=train_config.generation_kwargs,
|
||||
grad_norm_clip=train_config.grad_norm_clip,
|
||||
optimizer_type=train_config.optimizer_type,
|
||||
optimizer_kwargs=train_config.optimizer_kwargs,
|
||||
query_template=train_config.query_template,
|
||||
lr_scheduler_arg=train_config.lr_scheduler,
|
||||
use_amp=train_config.use_amp,
|
||||
is_adalora=isinstance(peft_config, AdaLoraConfig),
|
||||
)
|
||||
except Exception as e:
|
||||
print_verbose(f"Training failed with error: {e}")
|
||||
raise
|
||||
train_result = train(
|
||||
model=model,
|
||||
max_steps=train_config.max_steps,
|
||||
batch_size=train_config.batch_size,
|
||||
batch_size_eval=train_config.batch_size_eval,
|
||||
tokenizer=tokenizer,
|
||||
cuda_memory_init=cuda_memory_init,
|
||||
eval_steps=train_config.eval_steps,
|
||||
generation_kwargs=train_config.generation_kwargs,
|
||||
grad_norm_clip=train_config.grad_norm_clip,
|
||||
optimizer_type=train_config.optimizer_type,
|
||||
optimizer_kwargs=train_config.optimizer_kwargs,
|
||||
query_template=train_config.query_template,
|
||||
lr_scheduler_arg=train_config.lr_scheduler,
|
||||
use_amp=train_config.use_amp,
|
||||
is_adalora=isinstance(peft_config, AdaLoraConfig),
|
||||
)
|
||||
|
||||
if train_result.status == TrainStatus.FAILED:
|
||||
print_verbose("Training failed, not logging results")
|
||||
sys.exit(1)
|
||||
|
||||
# save the model in temp dir, get file size, clean it up afterwards if clean is passed
|
||||
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True, delete=clean) as tmp_dir:
|
||||
model.save_pretrained(tmp_dir)
|
||||
stat = os.stat(os.path.join(tmp_dir, SAFETENSORS_WEIGHTS_NAME))
|
||||
file_size = stat.st_size
|
||||
if not clean:
|
||||
print_verbose(f"Saved PEFT checkpoint to {tmp_dir}")
|
||||
try:
|
||||
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True, delete=clean) as tmp_dir:
|
||||
model.save_pretrained(tmp_dir)
|
||||
stat = os.stat(os.path.join(tmp_dir, SAFETENSORS_WEIGHTS_NAME))
|
||||
file_size = stat.st_size
|
||||
if not clean:
|
||||
print_verbose(f"Saved PEFT checkpoint to {tmp_dir}")
|
||||
except Exception as exc:
|
||||
print(f"Failed to save PEFT checkpoint due to the following error: {exc}")
|
||||
|
||||
time_total = time.perf_counter() - tic_total
|
||||
# log results: print and save to file
|
||||
|
@ -21,6 +21,7 @@ import json
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
import tempfile
|
||||
import warnings
|
||||
from dataclasses import asdict, dataclass
|
||||
from decimal import Decimal, DivisionByZero, InvalidOperation
|
||||
@ -545,6 +546,7 @@ class TrainResult:
|
||||
cuda_memory_reserved_log: list[int]
|
||||
losses: list[float]
|
||||
metrics: list[Any] # TODO
|
||||
error_msg: str
|
||||
|
||||
|
||||
def log_to_console(log_data: dict[str, Any], print_fn: Callable[..., None]) -> None:
|
||||
@ -621,6 +623,9 @@ def log_results(
|
||||
elif train_result.status == TrainStatus.SUCCESS:
|
||||
save_dir = RESULT_PATH
|
||||
print_fn("Experiment run was categorized as successful run")
|
||||
else:
|
||||
save_dir = tempfile.mkdtemp()
|
||||
print_fn(f"Experiment could not be categorized, writing results to {save_dir}. Please open an issue on PEFT.")
|
||||
|
||||
peft_config_dict = peft_config.to_dict()
|
||||
for key, value in peft_config_dict.items():
|
||||
@ -635,6 +640,7 @@ def log_results(
|
||||
"peft_branch": peft_branch,
|
||||
"train_config": asdict(train_config),
|
||||
"peft_config": peft_config_dict,
|
||||
"error_msg": train_result.error_msg,
|
||||
},
|
||||
"train_info": {
|
||||
"cuda_memory_reserved_avg": cuda_memory_avg,
|
||||
|
Reference in New Issue
Block a user