Mirror of https://github.com/volcengine/verl.git (synced 2025-10-20 13:43:50 +08:00)
[dev] fix: validation metrics (#1374)
### Checklist Before Starting

- [x] Search for similar PR(s).

### What does this PR do?

1. Fix the bug where `metric` is not added when `n == 1`.
2. Remove `std@1`.
3. Add an assertion that fails when initial validation runs but `val_metrics` is empty.

### Additional Info.

- **Issue Number**: none
- **Training**: none
- **Inference**: none

### Checklist Before Submitting

- [x] Read the [Contribute Guide](https://github.com/volcengine/verl?tab=readme-ov-file#contribution-guide).
- [x] Apply [pre-commit checks](https://github.com/volcengine/verl?tab=readme-ov-file#code-linting-and-formatting).
- [x] Add `[BREAKING]` to the PR title if it breaks any API.
- [x] Update the documentation about your changes in the [docs](https://github.com/volcengine/verl/tree/main/docs).
- [x] Add CI test(s) if necessary.
Committed by: GitHub
Parent: 1e47e412a4
Commit: 709796f849
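For context on items 1 and 2: with a single sampled response per prompt, the standard deviation is always 0 and best/worst/majority-of-N all collapse to that one response, so only `mean@1` is informative. A minimal, self-contained sketch of that metric-naming scheme (the helper below is hypothetical and only mirrors the `mean@N`/`std@N`/`best@N`/`worst@N` names used in the diff, not verl's actual implementation):

```python
import numpy as np


def summarize_responses(var_vals: list[float]) -> dict[str, float]:
    """Hypothetical sketch of the per-prompt summary this PR adjusts."""
    n = len(var_vals)
    metric = {f"mean@{n}": float(np.mean(var_vals))}
    if n > 1:  # std@1 is always 0.0 and best/worst@1 equal the single value, so skip them
        metric[f"std@{n}"] = float(np.std(var_vals))
        metric[f"best@{n}/mean"] = float(np.max(var_vals))
        metric[f"worst@{n}/mean"] = float(np.min(var_vals))
    return metric


print(summarize_responses([0.7]))            # {'mean@1': 0.7}
print(summarize_responses([0.7, 0.1, 0.4]))  # also reports std@3, best@3/mean, worst@3/mean
```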
```diff
@@ -67,6 +67,7 @@ class RayDAPOTrainer(RayPPOTrainer):
         # currently, we only support validation using the reward_function.
         if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
             val_metrics = self._validate()
+            assert val_metrics, f"{val_metrics=}"
             pprint(f"Initial validation metrics: {val_metrics}")
             logger.log(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
```
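The single added line in each trainer hunk is the same guard: an empty dict is falsy in Python, so `assert val_metrics, f"{val_metrics=}"` makes an initial validation pass that produces no metrics fail loudly instead of logging nothing. A tiny sketch of the failure mode it catches (the `_validate` stub is hypothetical; only the assert pattern comes from the diff):

```python
def _validate() -> dict:
    # Hypothetical stand-in for a validation pass that silently returns nothing.
    return {}


val_metrics = _validate()
# Raises: AssertionError: val_metrics={}
assert val_metrics, f"{val_metrics=}"
```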
```diff
@@ -327,6 +327,7 @@ class RayPRIMETrainer(RayPPOTrainer):
         # currently, we only support validation using the reward_function.
         if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
             val_metrics = self._validate()
+            assert val_metrics, f"{val_metrics=}"
             pprint(f"Initial validation metrics: {val_metrics}")
             logger.log(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
```
```diff
@@ -208,45 +208,42 @@ def process_validation_metrics(data_sources: list[str], sample_inputs: list[str]
         for var_name, var_vals in var2vals.items():
             if isinstance(var_vals[0], str):
                 continue
 
             metric = {}
             n_resps = len(var_vals)
             metric[f"mean@{n_resps}"] = np.mean(var_vals)
-            metric[f"std@{n_resps}"] = np.std(var_vals)
 
-            ns = []
-            n = 2
-            while n < n_resps:
-                ns.append(n)
-                n *= 2
-            ns.append(n_resps)
-
-            # If there are multiple responses, we can compute the best/worst-of-N metrics
-            # If not, they are the same as the single response metrics
             if n_resps > 1:
-                for n in ns:
-                    if n == n_resps:
-                        # Non-bootstrapped
-                        metric[f"best@{n}/mean"] = np.max(var_vals)
-                        metric[f"worst@{n}/mean"] = np.min(var_vals)
-                        if var2vals.get("pred", None) is not None:
-                            vote_data = [{"val": val, "pred": pred} for val, pred in zip(var_vals, var2vals["pred"])]
-                            metric[f"maj@{n}/mean"] = calc_maj_val(vote_data, vote_key="pred", val_key="val")
-                    else:
-                        # Bootstrapped
-                        [(bon_mean, bon_std), (won_mean, won_std)] = bootstrap_metric(data=var_vals, subset_size=n, reduce_fns=[np.max, np.min], seed=seed)
-                        metric[f"best@{n}/mean"], metric[f"best@{n}/std"] = bon_mean, bon_std
-                        metric[f"worst@{n}/mean"], metric[f"worst@{n}/std"] = won_mean, won_std
-                        if var2vals.get("pred", None) is not None:
-                            vote_data = [{"val": val, "pred": pred} for val, pred in zip(var_vals, var2vals["pred"])]
-                            [(maj_n_mean, maj_n_std)] = bootstrap_metric(
-                                data=vote_data,
-                                subset_size=n,
-                                reduce_fns=[partial(calc_maj_val, vote_key="pred", val_key="val")],
-                                seed=seed,
-                            )
-                            metric[f"maj@{n}/mean"], metric[f"maj@{n}/std"] = maj_n_mean, maj_n_std
-
-                data_src2prompt2var2metric[data_source][prompt][var_name] = metric
+                # n = n_resps
+                metric[f"std@{n_resps}"] = np.std(var_vals)
+                metric[f"best@{n_resps}/mean"] = np.max(var_vals)
+                metric[f"worst@{n_resps}/mean"] = np.min(var_vals)
+                if var2vals.get("pred", None) is not None:
+                    vote_data = [{"val": val, "pred": pred} for val, pred in zip(var_vals, var2vals["pred"])]
+                    metric[f"maj@{n_resps}/mean"] = calc_maj_val(vote_data, vote_key="pred", val_key="val")
+                # 1 < n < n_resps
+                ns = []
+                n = 2
+                while n < n_resps:
+                    ns.append(n)
+                    n *= 2
+
+                for n in ns:
+                    [(bon_mean, bon_std), (won_mean, won_std)] = bootstrap_metric(data=var_vals, subset_size=n, reduce_fns=[np.max, np.min], seed=seed)
+                    metric[f"best@{n}/mean"], metric[f"best@{n}/std"] = bon_mean, bon_std
+                    metric[f"worst@{n}/mean"], metric[f"worst@{n}/std"] = won_mean, won_std
+                    if var2vals.get("pred", None) is not None:
+                        vote_data = [{"val": val, "pred": pred} for val, pred in zip(var_vals, var2vals["pred"])]
+                        [(maj_n_mean, maj_n_std)] = bootstrap_metric(
+                            data=vote_data,
+                            subset_size=n,
+                            reduce_fns=[partial(calc_maj_val, vote_key="pred", val_key="val")],
+                            seed=seed,
+                        )
+                        metric[f"maj@{n}/mean"], metric[f"maj@{n}/std"] = maj_n_mean, maj_n_std
+
+            data_src2prompt2var2metric[data_source][prompt][var_name] = metric
 
         # Aggregate metrics across prompts
         data_src2var2metric2prompt_vals = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
```
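For readers unfamiliar with the helpers called above: `bootstrap_metric(data=..., subset_size=n, reduce_fns=[...], seed=...)` is unpacked into one `(mean, std)` pair per reduce function, and `calc_maj_val(vote_data, vote_key="pred", val_key="val")` returns the value associated with the majority prediction. The sketch below is a hypothetical re-implementation of that bootstrap pattern, assuming resampling with replacement; it only mirrors the call shape visible in the diff and is not verl's actual code:

```python
from typing import Any, Callable

import numpy as np


def bootstrap_metric_sketch(
    data: list[Any],
    subset_size: int,
    reduce_fns: list[Callable[[list[Any]], Any]],
    n_bootstrap: int = 1000,
    seed: int = 42,
) -> list[tuple[float, float]]:
    """Hypothetical sketch: resample subsets of size `subset_size` with replacement
    and report the (mean, std) of each reduce function over the resamples."""
    rng = np.random.RandomState(seed)
    samples = [[] for _ in reduce_fns]
    for _ in range(n_bootstrap):
        idx = rng.choice(len(data), size=subset_size, replace=True)
        subset = [data[i] for i in idx]
        for j, fn in enumerate(reduce_fns):
            samples[j].append(fn(subset))
    return [(float(np.mean(vals)), float(np.std(vals))) for vals in samples]


# Example: estimate best-of-2 and worst-of-2 from four sampled response scores.
scores = [0.0, 1.0, 1.0, 0.0]
[(bo2_mean, bo2_std), (wo2_mean, wo2_std)] = bootstrap_metric_sketch(
    data=scores, subset_size=2, reduce_fns=[np.max, np.min]
)
print(f"best@2 ~= {bo2_mean:.2f} (std {bo2_std:.2f}), worst@2 ~= {wo2_mean:.2f} (std {wo2_std:.2f})")
```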
```diff
@@ -869,6 +869,7 @@ class RayPPOTrainer:
         # currently, we only support validation using the reward_function.
         if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
             val_metrics = self._validate()
+            assert val_metrics, f"{val_metrics=}"
             pprint(f"Initial validation metrics: {val_metrics}")
             logger.log(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
```
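The guard around all three call sites reads two optional trainer flags with defaults: `val_before_train` (default `True`) and `val_only` (default `False`). A minimal sketch of that lookup with a plain dict standing in for the trainer config (in verl the object is typically an OmegaConf config, but `.get()` with a default behaves the same way here):

```python
# Hypothetical plain-dict stand-in for `self.config.trainer`.
trainer_cfg = {"val_only": True}  # val_before_train unset -> falls back to the default

run_initial_validation = trainer_cfg.get("val_before_train", True)  # True (default)
stop_after_validation = trainer_cfg.get("val_only", False)          # True (set explicitly)
print(run_initial_validation, stop_after_validation)
```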