[dev] fix: validation metrics (#1374)

### Checklist Before Starting

- [x] Search for similar PR(s).

### What does this PR do?

1. Fix the bug that the `metric` dict is not added to the results when `n == 1`.
2. Remove `std@1`, which is always 0 for a single response (see the sketch after this list).
3. Add an assertion that fails fast when initial validation runs but `val_metrics` comes back
empty.
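
For illustration, a minimal standalone sketch of the intended behavior after the fix (a hypothetical helper, not verl's actual implementation): the metric dict is always built and returned, `mean@N` is always recorded, and `std@N` / `best@N` / `worst@N` are only computed when there is more than one response.

```python
import numpy as np

def summarize_responses(var_vals: list[float]) -> dict:
    """Sketch of the fixed control flow: always build and return the metric
    dict; only compute std/best/worst when there are multiple responses."""
    metric = {}
    n_resps = len(var_vals)
    metric[f"mean@{n_resps}"] = np.mean(var_vals)
    if n_resps > 1:  # std@1 would always be 0, so it is skipped
        metric[f"std@{n_resps}"] = np.std(var_vals)
        metric[f"best@{n_resps}/mean"] = np.max(var_vals)
        metric[f"worst@{n_resps}/mean"] = np.min(var_vals)
    return metric

print(summarize_responses([0.7]))                 # only mean@1 -- no longer dropped, no std@1
print(summarize_responses([0.0, 1.0, 1.0, 0.0]))  # adds std@4, best@4/mean, worst@4/mean
```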

### Additional Info.

- **Issue Number**: none
- **Training**: none
- **Inference**: none

### Checklist Before Submitting

- [x] Read the [Contribute
Guide](https://github.com/volcengine/verl?tab=readme-ov-file#contribution-guide).
- [x] Apply [pre-commit
checks](https://github.com/volcengine/verl?tab=readme-ov-file#code-linting-and-formatting).
- [x] Add `[BREAKING]` to the PR title if it breaks any API.
- [x] Update the documentation about your changes in the
[docs](https://github.com/volcengine/verl/tree/main/docs).
- [x] Add CI test(s) if necessary.
Author: Shawn/Yuxuan Tong
Date: 2025-05-05 00:06:53 +08:00
Committed by: GitHub
Parent: 1e47e412a4
Commit: 709796f849
4 changed files with 33 additions and 33 deletions

```diff
@@ -67,6 +67,7 @@ class RayDAPOTrainer(RayPPOTrainer):
         # currently, we only support validation using the reward_function.
         if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
             val_metrics = self._validate()
+            assert val_metrics, f"{val_metrics=}"
             pprint(f"Initial validation metrics: {val_metrics}")
             logger.log(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
```
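
The assertion relies on Python truthiness: an empty dict returned by `_validate()` previously slipped silently into `logger.log`, while `assert val_metrics, f"{val_metrics=}"` now fails fast and includes the offending value in the error message. A tiny self-contained illustration (not verl code):

```python
# Minimal illustration of the added guard: an empty metrics dict is falsy,
# so the assert fires and the self-documenting f-string shows the value.
val_metrics = {}
try:
    assert val_metrics, f"{val_metrics=}"
except AssertionError as err:
    print(err)  # prints: val_metrics={}
```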

```diff
@@ -327,6 +327,7 @@ class RayPRIMETrainer(RayPPOTrainer):
         # currently, we only support validation using the reward_function.
         if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
             val_metrics = self._validate()
+            assert val_metrics, f"{val_metrics=}"
             pprint(f"Initial validation metrics: {val_metrics}")
             logger.log(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
```

```diff
@@ -208,45 +208,42 @@ def process_validation_metrics(data_sources: list[str], sample_inputs: list[str]
             for var_name, var_vals in var2vals.items():
                 if isinstance(var_vals[0], str):
                     continue
                 metric = {}
                 n_resps = len(var_vals)
                 metric[f"mean@{n_resps}"] = np.mean(var_vals)
-                metric[f"std@{n_resps}"] = np.std(var_vals)
-                ns = []
-                n = 2
-                while n < n_resps:
-                    ns.append(n)
-                    n *= 2
-                ns.append(n_resps)
-                # If there are multiple responses, we can compute the best/worst-of-N metrics
-                # If not, they are the same as the single response metrics
                 if n_resps > 1:
-                    for n in ns:
-                        if n == n_resps:
-                            # Non-bootstrapped
-                            metric[f"best@{n}/mean"] = np.max(var_vals)
-                            metric[f"worst@{n}/mean"] = np.min(var_vals)
-                            if var2vals.get("pred", None) is not None:
-                                vote_data = [{"val": val, "pred": pred} for val, pred in zip(var_vals, var2vals["pred"])]
-                                metric[f"maj@{n}/mean"] = calc_maj_val(vote_data, vote_key="pred", val_key="val")
-                        else:
-                            # Bootstrapped
-                            [(bon_mean, bon_std), (won_mean, won_std)] = bootstrap_metric(data=var_vals, subset_size=n, reduce_fns=[np.max, np.min], seed=seed)
-                            metric[f"best@{n}/mean"], metric[f"best@{n}/std"] = bon_mean, bon_std
-                            metric[f"worst@{n}/mean"], metric[f"worst@{n}/std"] = won_mean, won_std
-                            if var2vals.get("pred", None) is not None:
-                                vote_data = [{"val": val, "pred": pred} for val, pred in zip(var_vals, var2vals["pred"])]
-                                [(maj_n_mean, maj_n_std)] = bootstrap_metric(
-                                    data=vote_data,
-                                    subset_size=n,
-                                    reduce_fns=[partial(calc_maj_val, vote_key="pred", val_key="val")],
-                                    seed=seed,
-                                )
-                                metric[f"maj@{n}/mean"], metric[f"maj@{n}/std"] = maj_n_mean, maj_n_std
-                    data_src2prompt2var2metric[data_source][prompt][var_name] = metric
+                    # n = n_resps
+                    metric[f"std@{n_resps}"] = np.std(var_vals)
+                    metric[f"best@{n_resps}/mean"] = np.max(var_vals)
+                    metric[f"worst@{n_resps}/mean"] = np.min(var_vals)
+                    if var2vals.get("pred", None) is not None:
+                        vote_data = [{"val": val, "pred": pred} for val, pred in zip(var_vals, var2vals["pred"])]
+                        metric[f"maj@{n_resps}/mean"] = calc_maj_val(vote_data, vote_key="pred", val_key="val")
+
+                    # 1 < n < n_resps
+                    ns = []
+                    n = 2
+                    while n < n_resps:
+                        ns.append(n)
+                        n *= 2
+
+                    for n in ns:
+                        [(bon_mean, bon_std), (won_mean, won_std)] = bootstrap_metric(data=var_vals, subset_size=n, reduce_fns=[np.max, np.min], seed=seed)
+                        metric[f"best@{n}/mean"], metric[f"best@{n}/std"] = bon_mean, bon_std
+                        metric[f"worst@{n}/mean"], metric[f"worst@{n}/std"] = won_mean, won_std
+                        if var2vals.get("pred", None) is not None:
+                            vote_data = [{"val": val, "pred": pred} for val, pred in zip(var_vals, var2vals["pred"])]
+                            [(maj_n_mean, maj_n_std)] = bootstrap_metric(
+                                data=vote_data,
+                                subset_size=n,
+                                reduce_fns=[partial(calc_maj_val, vote_key="pred", val_key="val")],
+                                seed=seed,
+                            )
+                            metric[f"maj@{n}/mean"], metric[f"maj@{n}/std"] = maj_n_mean, maj_n_std
+
+                data_src2prompt2var2metric[data_source][prompt][var_name] = metric
 
     # Aggregate metrics across prompts
     data_src2var2metric2prompt_vals = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
```
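
After this change, exact statistics are reported only at `n = n_resps` (the `# n = n_resps` block), while the powers of two below `n_resps` (e.g. `ns = [2, 4]` for 8 responses) are estimated by resampling subsets via `bootstrap_metric`. Below is a rough, self-contained sketch of that bootstrap idea, using a hypothetical helper rather than verl's `bootstrap_metric` (sampling with replacement here is an illustrative choice, not necessarily what verl does):

```python
import numpy as np

def bootstrap_best_at_n(vals, n, n_bootstrap=1000, seed=42):
    """Draw subsets of size n many times, reduce each with max (best-of-n),
    and report the mean and std of the estimates."""
    rng = np.random.default_rng(seed)
    estimates = [np.max(rng.choice(vals, size=n, replace=True)) for _ in range(n_bootstrap)]
    return float(np.mean(estimates)), float(np.std(estimates))

scores = [0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]  # 8 responses -> ns = [2, 4]
best2_mean, best2_std = bootstrap_best_at_n(scores, n=2)
print(f"best@2/mean={best2_mean:.3f}, best@2/std={best2_std:.3f}")
```

The same idea applies to `worst@n` with `np.min` as the reduction, and to `maj@n` with a majority-vote reduction over the stored predictions (`calc_maj_val` in the diff above).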

```diff
@@ -869,6 +869,7 @@ class RayPPOTrainer:
         # currently, we only support validation using the reward_function.
         if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
             val_metrics = self._validate()
+            assert val_metrics, f"{val_metrics=}"
             pprint(f"Initial validation metrics: {val_metrics}")
             logger.log(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
```