From 2fdfbdcba6f2e076f64bc47922d8fe6cf7dc7da5 Mon Sep 17 00:00:00 2001 From: Qiao <88535122+Qiao0124@users.noreply.github.com> Date: Sun, 3 Aug 2025 08:04:15 +0800 Subject: [PATCH] [doc] fix: Fix the role assignment error in the interaction demo file and doc. (#2476) ### What does this PR do? Fix the role assignment error in the interaction demo file verl/interactions/gsm8k_interaction.py and doc. The assistant is expected to solve problems, while users provide problems and feedback within the messages list. ### Checklist Before Starting - [x] Search for similar PRs. Paste at least one query link here: ... - [x] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI) - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data` - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]` - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test` - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title. - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching` ### Test Update tests/interactions/test_gsm8k_interaction.py. ### API and Usage Example > Demonstrate how the API changes if any, and provide usage example(s) if possible. ```python # Add code snippet or script demonstrating how to use this ``` ### Design & Code Changes > Demonstrate the high-level design if this PR is complex, and list the specific changes. ### Checklist Before Submitting > [!IMPORTANT] > Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review. - [x] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md). - [x] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always` - [x] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs). - [x] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ... - [x] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). --------- Co-authored-by: H --- docs/sglang_multiturn/interaction_system.rst | 5 +-- tests/interactions/test_gsm8k_interaction.py | 33 ++++++++++---------- verl/interactions/gsm8k_interaction.py | 2 +- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/docs/sglang_multiturn/interaction_system.rst b/docs/sglang_multiturn/interaction_system.rst index 68cf3caa8..812a9484e 100644 --- a/docs/sglang_multiturn/interaction_system.rst +++ b/docs/sglang_multiturn/interaction_system.rst @@ -180,7 +180,7 @@ The GSM8K interaction demonstrates a complete implementation for math problem-so return instance_id async def generate_response(self, instance_id, messages, **kwargs): - # Extract last user message content + # Extract last assistant message content content = "" for item in reversed(messages): if item.get("role") == "assistant": @@ -299,7 +299,8 @@ Comprehensive testing is essential for interaction systems: # Test complete workflow instance_id = await interaction.start_interaction(ground_truth="expected_answer") - messages = [{"role": "user", "content": "user_content"}, {"role": "assistant", "content": "assistant_response"}] + + messages = [{"role": "user", "content": "user_content"}, {"role": "assistant", "content": "assistant_content"}] should_terminate, response, reward, metadata = await interaction.generate_response(instance_id, messages) assert should_terminate in [True, False] diff --git a/tests/interactions/test_gsm8k_interaction.py b/tests/interactions/test_gsm8k_interaction.py index bc16877c2..ac40f32f4 100644 --- a/tests/interactions/test_gsm8k_interaction.py +++ b/tests/interactions/test_gsm8k_interaction.py @@ -80,7 +80,7 @@ class TestGsm8kInteraction: # Setup instance await self.interaction.start_interaction(instance_id=instance_id, ground_truth=ground_truth) - messages = [{"role": "user", "content": "#### 42"}] + messages = [{"role": "assistant", "content": "#### 42"}] with patch("verl.utils.reward_score.gsm8k.compute_score", return_value=1.0): should_terminate, response, reward, metadata = await self.interaction.generate_response( @@ -102,7 +102,7 @@ class TestGsm8kInteraction: # Setup instance await self.interaction.start_interaction(instance_id=instance_id, ground_truth=ground_truth) - messages = [{"role": "user", "content": "42"}] + messages = [{"role": "assistant", "content": "42"}] with patch("verl.utils.reward_score.gsm8k.compute_score", return_value=1.0): should_terminate, response, reward, metadata = await self.interaction.generate_response( @@ -123,7 +123,7 @@ class TestGsm8kInteraction: # Setup instance await self.interaction.start_interaction(instance_id=instance_id, ground_truth=ground_truth) - messages = [{"role": "user", "content": "24"}] + messages = [{"role": "assistant", "content": "24"}] with patch("verl.utils.reward_score.gsm8k.compute_score", return_value=0.0): should_terminate, response, reward, metadata = await self.interaction.generate_response( @@ -137,7 +137,7 @@ class TestGsm8kInteraction: @pytest.mark.asyncio async def test_generate_response_multiple_messages(self): - """Test generate_response with multiple messages (should use last user message).""" + """Test generate_response with multiple messages (should use last assistant message).""" instance_id = "test_instance" ground_truth = "42" @@ -146,8 +146,9 @@ class TestGsm8kInteraction: messages = [ {"role": "user", "content": "What is 2+2?"}, - {"role": "assistant", "content": "Let me think about this..."}, - {"role": "user", "content": "#### 42"}, + {"role": "assistant", "content": "### 4"}, + {"role": "user", "content": "What is 40+2?"}, + {"role": "assistant", "content": "#### 42"}, ] with patch("verl.utils.reward_score.gsm8k.compute_score", return_value=1.0): @@ -160,15 +161,15 @@ class TestGsm8kInteraction: assert self.interaction._instance_dict[instance_id]["response"] == "#### 42" @pytest.mark.asyncio - async def test_generate_response_no_user_message(self): - """Test generate_response with no user messages.""" + async def test_generate_response_no_assistant_message(self): + """Test generate_response with no assistant messages.""" instance_id = "test_instance" ground_truth = "42" # Setup instance await self.interaction.start_interaction(instance_id=instance_id, ground_truth=ground_truth) - messages = [{"role": "assistant", "content": "Hello!"}] + messages = [{"role": "user", "content": "Hello!"}] with patch("verl.utils.reward_score.gsm8k.compute_score", return_value=0.0): should_terminate, response, reward, metadata = await self.interaction.generate_response( @@ -262,7 +263,7 @@ class TestGsm8kInteraction: instance_id = await self.interaction.start_interaction(ground_truth=ground_truth) # Generate response with correct answer - messages = [{"role": "user", "content": "42"}] + messages = [{"role": "assistant", "content": "42"}] with patch("verl.utils.reward_score.gsm8k.compute_score", return_value=1.0): should_terminate, response, reward, metadata = await self.interaction.generate_response( @@ -285,7 +286,7 @@ class TestGsm8kInteraction: instance_id = await self.interaction.start_interaction(ground_truth=ground_truth) # Generate response with incorrect answer - messages = [{"role": "user", "content": "24"}] + messages = [{"role": "assistant", "content": "24"}] with patch("verl.utils.reward_score.gsm8k.compute_score", return_value=0.0): should_terminate, response, reward, metadata = await self.interaction.generate_response( @@ -296,8 +297,8 @@ class TestGsm8kInteraction: assert reward == 0.0 # Continue with another attempt - messages.append({"role": "assistant", "content": response}) - messages.append({"role": "user", "content": "42"}) + messages.append({"role": "user", "content": response}) + messages.append({"role": "assistant", "content": "42"}) with patch("verl.utils.reward_score.gsm8k.compute_score", return_value=1.0): should_terminate, response, reward, metadata = await self.interaction.generate_response( @@ -326,8 +327,8 @@ class TestGsm8kInteraction: assert instance_id_2 in self.interaction._instance_dict # Test responses for both instances - messages_1 = [{"role": "user", "content": "42"}] - messages_2 = [{"role": "user", "content": "24"}] + messages_1 = [{"role": "assistant", "content": "42"}] + messages_2 = [{"role": "assistant", "content": "24"}] with patch("verl.utils.reward_score.gsm8k.compute_score", side_effect=[1.0, 1.0]): should_terminate_1, _, reward_1, _ = await self.interaction.generate_response(instance_id_1, messages_1) @@ -374,7 +375,7 @@ class TestGsm8kInteraction: await self.interaction.start_interaction(instance_id=instance_id, ground_truth=ground_truth) messages = [ - {"role": "user"} # Missing content field + {"role": "assistant"} # Missing content field ] with patch("verl.utils.reward_score.gsm8k.compute_score", return_value=0.0): diff --git a/verl/interactions/gsm8k_interaction.py b/verl/interactions/gsm8k_interaction.py index c30768391..67898ad57 100644 --- a/verl/interactions/gsm8k_interaction.py +++ b/verl/interactions/gsm8k_interaction.py @@ -31,7 +31,7 @@ class Gsm8kInteraction(BaseInteraction): """A demo interaction for calculating the reward of gsm8k. - `start_interaction`: start a interaction instance for a trajectory. - - `generate_response`: generate the response of the user. + - `generate_response`: generate the response of the assistant. - `calculate_score`: calculate the score of the interaction. - `finalize_interaction`: finalize the interaction instance. """