Compare commits

...

2 Commits

Author SHA1 Message Date
d88265cf09 Small fix in planning 2024-09-02 09:11:49 +02:00
652488ecdb Make easier tags 2024-08-30 19:43:12 +02:00
3 changed files with 189 additions and 161 deletions

View File

@ -110,19 +110,18 @@ def parse_json_blob(json_blob: str) -> Dict[str, str]:
def parse_code_blob(code_blob: str) -> str:
try:
pattern = r"```(?:py|python)?\n(.*?)\n```"
match = re.search(pattern, code_blob, re.DOTALL)
return match.group(1).strip()
return code_blob.strip()
except Exception as e:
raise ValueError(
f"""
The code blob you used is invalid: due to the following error: {e}
This means that the regex pattern {pattern} was not respected: make sure to include code with the correct pattern, for instance:
Thoughts: Your thoughts
Code:
```py
This means that the regex pattern was not respected: make sure to include code with the correct pattern, for instance:
<thoughts>
Your thoughts
<end_thoughts>
<code>
# Your python code here
```<end_action>"""
<end_code>"""
)
@ -139,27 +138,6 @@ def parse_json_tool_call(json_blob: str) -> Tuple[str, Dict[str, str]]:
)
def parse_text_tool_call(text: str) -> Tuple[str, Union[str, Dict[str, str]]]:
    """
    Parse a plain-text tool call into a tool name and its input.

    Expects a text in the format: 'Action:', 'Action input:', 'Observation:'. 'Action input:' contains a json string with input arguments.

    Args:
        text: Raw text holding the tool call.

    Returns:
        A `(tool_name, tool_input)` tuple. `tool_input` is the result of
        `parse_json_blob` when the input section contains a "{", otherwise the
        stripped input text with double quotes removed.

    Raises:
        ValueError: If the text cannot be split into a tool name and a tool
            input, or if parsing the input fails. The original exception is
            attached as the cause.
    """
    try:
        # Keep only what precedes the first 'Observation:' marker.
        if "Observation:" in text:
            text = text.split("Observation:")[0]
        # Keep only what follows the first 'Action:' marker.
        if "Action:" in text:
            text = text.split("Action:")[1]
        # Exactly one 'Action input:' separator is required; any other count
        # makes the unpacking fail and is reported as a parsing error below.
        tool_name, tool_input = text.split("Action input:")
        if "{" in tool_input:
            tool_input = parse_json_blob(tool_input)
        else:
            tool_input = tool_input.strip().replace('"', "")
        return tool_name.strip().replace('"', "").replace("\\", ""), tool_input
    except Exception as e:
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise ValueError(
            f"Error in parsing the text tool call: {e}. Be sure to provide the correct format. DO NOT repeat your previous incorrect tool call."
        ) from e
def to_text(input: Union[List[Dict[str, str]], Dict[str, str], str]) -> str:
if isinstance(input, list):
return "\n".join([m["content"] for m in input])
@ -406,7 +384,7 @@ class Agent:
prompt_message = {"role": MessageRole.SYSTEM, "content": self.logs[0]["system_prompt"]}
task_message = {
"role": MessageRole.USER,
"content": "Task: " + self.logs[0]["task"],
"content": "<task>" + self.logs[0]["task"] + "<end_task>",
}
if summary_mode:
memory = [task_message]
@ -419,37 +397,42 @@ class Agent:
if "facts" in step_log:
thought_message = {
"role": MessageRole.ASSISTANT,
"content": "[FACTS LIST]:\n" + step_log["facts"].strip(),
"content": "<facts_list>\n" + step_log["facts"].strip() + "\n<end_facts_list>",
}
memory.append(thought_message)
if "plan" in step_log and not summary_mode:
thought_message = {"role": MessageRole.ASSISTANT, "content": "[PLAN]:\n" + step_log["plan"].strip()}
thought_message = {
"role": MessageRole.ASSISTANT,
"content": "<plan>\n" + step_log["plan"].strip() + "\n<end_plan>",
}
memory.append(thought_message)
if "tool_call" in step_log and summary_mode:
tool_call_message = {
"role": MessageRole.ASSISTANT,
"content": f"[STEP {i} TOOL CALL]: " + str(step_log["tool_call"]).strip(),
"content": f"<step_{i}_tool_call>\n "
+ str(step_log["tool_call"]).strip()
+ f"\n<end_step_{i}_tool_call>",
}
memory.append(tool_call_message)
if "task" in step_log:
tool_call_message = {
"role": MessageRole.USER,
"content": "New task:\n" + step_log["task"],
"content": "<new_task>\n" + step_log["task"] + "\n<end_new_task>",
}
memory.append(tool_call_message)
if "error" in step_log or "observation" in step_log:
if "error" in step_log:
message_content = (
f"[OUTPUT OF STEP {i}] Error: "
"<error>\n"
+ str(step_log["error"])
+ "\nNow let's retry: take care not to repeat previous errors! If you have retried several times, try a completely different approach.\n"
+ "\nNow let's retry: take care not to repeat previous errors! If you have retried several times, try a completely different approach.\n<end_error>"
)
elif "observation" in step_log:
message_content = f"[OUTPUT OF STEP {i}] Observation:\n{step_log['observation']}"
message_content = f"<observation>\n{step_log['observation']}\n<end_observation>"
tool_response_message = {"role": MessageRole.TOOL_RESPONSE, "content": message_content}
memory.append(tool_response_message)
@ -597,7 +580,7 @@ class CodeAgent(Agent):
prompt_message = {"role": MessageRole.SYSTEM, "content": self.system_prompt}
task_message = {
"role": MessageRole.USER,
"content": "Task: " + self.task,
"content": "<task>" + self.task + "<end_task>",
}
self.prompt = [prompt_message, task_message]
@ -612,7 +595,7 @@ class CodeAgent(Agent):
# Parse
try:
_, code_action = self.extract_action(llm_output=llm_output, split_token="Code:")
_, code_action = self.extract_action(llm_output=llm_output, split_token="<code>")
except Exception as e:
self.logger.debug(
f"Error in extracting action, trying to parse the whole output as code. Error trace: {e}"
@ -934,7 +917,7 @@ class ReactJsonAgent(ReactAgent):
# Parse
self.logger.debug("===== Extracting action =====")
rationale, action = self.extract_action(llm_output=llm_output, split_token="Action:")
rationale, action = self.extract_action(llm_output=llm_output, split_token="<action>")
try:
tool_name, arguments = self.tool_parser(action)
@ -1044,9 +1027,7 @@ class ReactCodeAgent(ReactAgent):
try:
additional_args = {"grammar": self.grammar} if self.grammar is not None else {}
llm_output = self.llm_engine(
self.prompt, stop_sequences=["<end_action>", "Observation:"], **additional_args
)
llm_output = self.llm_engine(self.prompt, stop_sequences=["<end_code>", "<end_action>"], **additional_args)
except Exception as e:
raise AgentGenerationError(f"Error in generating llm output: {e}.")
@ -1057,7 +1038,7 @@ class ReactCodeAgent(ReactAgent):
# Parse
self.logger.debug("===== Extracting action =====")
try:
rationale, raw_code_action = self.extract_action(llm_output=llm_output, split_token="Code:")
rationale, raw_code_action = self.extract_action(llm_output=llm_output, split_token="<code>")
except Exception as e:
self.logger.debug(f"Error in extracting action, trying to parse the whole output. Error trace: {e}")
rationale, raw_code_action = llm_output, llm_output
@ -1066,6 +1047,7 @@ class ReactCodeAgent(ReactAgent):
code_action = parse_code_blob(raw_code_action)
except Exception as e:
error_msg = f"Error in code parsing: {e}. Make sure to provide correct code"
print("CODE GIVEN:", raw_code_action)
raise AgentParsingError(error_msg)
current_step_logs["rationale"] = rationale

View File

@ -30,7 +30,7 @@ def pull_message(step_log: dict):
used_code = step_log["tool_call"]["tool_name"] == "code interpreter"
content = step_log["tool_call"]["tool_arguments"]
if used_code:
content = f"```py\n{content}\n```"
content = f"<code>\n{content}\n</code>"
yield ChatMessage(
role="assistant",
metadata={"title": f"🛠️ Used tool {step_log['tool_call']['tool_name']}"},

View File

@ -61,72 +61,78 @@ Tools:
Examples:
---
Task: "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French."
<task>"Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French."<end_task>
Thought: I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.
Code:
```py
<thought>
I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.
<end_thought>
<code>
translated_question = translator(question=question, src_lang="French", tgt_lang="English")
print(f"The translated question is {translated_question}.")
answer = image_qa(image=image, question=translated_question)
final_answer(f"The answer is {answer}")
```<end_action>
<end_code>
---
Task: "Identify the oldest person in the `document` and create an image showcasing the result."
<task>"Identify the oldest person in the `document` and create an image showcasing the result."<end_task>
Thought: I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
Code:
```py
<thought>
I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
<end_thought>
<code>
answer = document_qa(document, question="What is the oldest person?")
print(f"The answer is {answer}.")
image = image_generator(answer)
final_answer(image)
```<end_action>
<end_code>
---
Task: "Generate an image using the text given in the variable `caption`."
<task>"Generate an image using the text given in the variable `caption`."<end_task>
Thought: I will use the following tool: `image_generator` to generate an image.
Code:
```py
<thought>
I will use the following tool: `image_generator` to generate an image.
<end_thought>
<code>
image = image_generator(prompt=caption)
final_answer(image)
```<end_action>
<end_code>
---
Task: "Summarize the text given in the variable `text` and read it out loud."
<task>"Summarize the text given in the variable `text` and read it out loud."<end_task>
Thought: I will use the following tools: `summarizer` to create a summary of the input text, then `text_reader` to read it out loud.
Code:
```py
<thought>
I will use the following tools: `summarizer` to create a summary of the input text, then `text_reader` to read it out loud.
<end_thought>
<code>
summarized_text = summarizer(text)
print(f"Summary: {summarized_text}")
audio_summary = text_reader(summarized_text)
final_answer(audio_summary)
```<end_action>
<end_code>
---
Task: "Answer the question in the variable `question` about the text in the variable `text`. Use the answer to generate an image."
<task>"Answer the question in the variable `question` about the text in the variable `text`. Use the answer to generate an image."<end_task>
Thought: I will use the following tools: `text_qa` to create the answer, then `image_generator` to generate an image according to the answer.
Code:
```py
<thought>
I will use the following tools: `text_qa` to create the answer, then `image_generator` to generate an image according to the answer.
<end_thought>
<code>
answer = text_qa(text=text, question=question)
print(f"The answer is {answer}.")
image = image_generator(answer)
final_answer(image)
```<end_action>
<end_code>
---
Task: "Caption the following `image`."
<task>"Caption the following `image`."<end_task>
Thought: I will use the following tool: `image_captioner` to generate a caption for the image.
Code:
```py
<thought>
I will use the following tool: `image_captioner` to generate a caption for the image.
<end_thought>
<code>
caption = image_captioner(image)
final_answer(caption)
```<end_action>
<end_code>
---
Above examples were using tools that might not exist for you. You only have access to those Tools:
@ -149,118 +155,149 @@ The $ACTION_JSON_BLOB should only contain a SINGLE action, do NOT return a list
{
"action": $TOOL_NAME,
"action_input": $INPUT
}<end_action>
}
Make sure to have the $INPUT as a dictionary in the right format for the tool you are using, and do not put variable names as input if you can find the right values.
You should ALWAYS use the following format:
Thought: you should always think about one action to take. Then use the action as follows:
Action:
<thought>
you should always think about one action to take. Then use the action as follows:
<end_thought>
<action>
$ACTION_JSON_BLOB
Observation: the result of the action
<end_action>
<observation>the result of the action<end_observation>
... (this Thought/Action/Observation can repeat N times, you should take several steps when needed. The $ACTION_JSON_BLOB must only use a SINGLE action at a time.)
You can use the result of the previous action as input for the next action.
The observation will always be a string: it can represent a file, like "image_1.jpg".
Then you can use it as input for the next action. You can do it for instance as follows:
Observation: "image_1.jpg"
<observation>"image_1.jpg"<end_observation>
Thought: I need to transform the image that I received in the previous observation to make it green.
Action:
<thought>
I need to transform the image that I received in the previous observation to make it green.
<end_thought>
<action>
{
"action": "image_transformer",
"action_input": {"image": "image_1.jpg"}
}<end_action>
}
<end_action>
To provide the final answer to the task, use an action blob with "action": "final_answer" tool. It is the only way to complete the task, else you will be stuck on a loop. So your final output should look like this:
Action:
<action>
{
"action": "final_answer",
"action_input": {"answer": "insert your final answer here"}
}<end_action>
}
<end_action>
Here are a few examples using notional tools:
---
Task: "Generate an image of the oldest person in this document."
<task>"Generate an image of the oldest person in this document."<end_task>
Thought: I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
Action:
<thought>
I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
<end_thought>
<action>
{
"action": "document_qa",
"action_input": {"document": "document.pdf", "question": "Who is the oldest person mentioned?"}
}<end_action>
Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland."
}
<end_action>
<observation>"The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland."<end_observation>
Thought: I will now generate an image showcasing the oldest person.
Action:
<thought>
I will now generate an image showcasing the oldest person.
<end_thought>
<action>
{
"action": "image_generator",
"action_input": {"text": "A portrait of John Doe, a 55-year-old man living in Canada."}
}<end_action>
Observation: "image.png"
}
<end_action>
<observation>"image.png"<end_observation>
Thought: I will now return the generated image.
Action:
<thought>
I will now return the generated image.
<end_thought>
<action>
{
"action": "final_answer",
"action_input": "image.png"
}<end_action>
}
<end_action>
---
Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
<task>"What is the result of the following operation: 5 + 3 + 1294.678?"<end_task>
Thought: I will use python code evaluator to compute the result of the operation and then return the final answer using the `final_answer` tool
Action:
<thought>
I will use python code evaluator to compute the result of the operation and then return the final answer using the `final_answer` tool
<end_thought>
<action>
{
"action": "python_interpreter",
"action_input": {"code": "5 + 3 + 1294.678"}
}<end_action>
Observation: 1302.678
}
<end_action>
<observation>1302.678<end_observation>
Thought: Now that I know the result, I will now return it.
Action:
<thought>
Now that I know the result, I will now return it.
<end_thought>
<action>
{
"action": "final_answer",
"action_input": "1302.678"
}<end_action>
}
<end_action>
---
Task: "Which city has the highest population , Guangzhou or Shanghai?"
<task>"Which city has the highest population , Guangzhou or Shanghai?"<end_task>
Thought: I need to get the populations for both cities and compare them: I will use the tool `search` to get the population of both cities.
Action:
<thought>
I need to get the populations for both cities and compare them: I will use the tool `search` to get the population of both cities.
<end_thought>
<action>
{
"action": "search",
"action_input": "Population Guangzhou"
}<end_action>
Observation: ['Guangzhou has a population of 15 million inhabitants as of 2021.']
}
<end_action>
<observation>['Guangzhou has a population of 15 million inhabitants as of 2021.']<end_observation>
Thought: Now let's get the population of Shanghai using the tool 'search'.
Action:
<thought>
Now let's get the population of Shanghai using the tool 'search'.
<end_thought>
<action>
{
"action": "search",
"action_input": "Population Shanghai"
}
<end_action>
Observation: '26 million (2019)'
<observation>'26 million (2019)'<end_observation>
Thought: Now I know that Shanghai has a larger population. Let's return the result.
Action:
<thought>
Now I know that Shanghai has a larger population. Let's return the result.
<end_thought>
<action>
{
"action": "final_answer",
"action_input": "Shanghai"
}<end_action>
}
<end_action>
Above examples were using notional tools that might not exist for you. You only have access to those tools:
<<tool_descriptions>>
Here are the rules you should always follow to solve your task:
1. ALWAYS provide a 'Thought:' sequence, and an 'Action:' sequence that ends with <end_action>, else you will fail.
1. ALWAYS provide a <thought> sequence ending with <end_thought>, and an '<action>' sequence that ends with <end_action>, else you will fail.
2. Always use the right arguments for the tools. Never use variable names in the 'action_input' field, use the value instead.
3. Call a tool only when needed: do not call the search agent if you do not need information, try to solve the task yourself.
4. Never re-do a tool call that you previously did with the exact same parameters.
@ -271,82 +308,91 @@ Now Begin! If you solve the task correctly, you will receive a reward of $1,000,
DEFAULT_REACT_CODE_SYSTEM_PROMPT = """You are an expert assistant who can solve any task using code blobs. You will be given a task to solve as best you can.
To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
To solve the task, you must plan forward to proceed in a series of steps, in a cycle of '<thought>', '<code>', and '<observation>' sequences.
At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_action>' sequence.
At each step, in the '<thought>' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
Then in the '<code>' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence. Don't put any other thing than code in the <code><end_code> sequence, any markdown for instance will give an error.
During each intermediate step, you can use 'print()' to save whatever important information you will then need.
These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
These print outputs will then appear in the '<observation>' field, which will be available as input for the next step.
In the end you have to return a final answer using the `final_answer` tool.
Here are a few examples using notional tools:
---
Task: "Generate an image of the oldest person in this document."
<task>"Generate an image of the oldest person in this document."<end_task>
Thought: I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
Code:
```py
<thought>
I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
<end_thought>
<code>
answer = document_qa(document=document, question="Who is the oldest person mentioned?")
print(answer)
```<end_action>
Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland."
<end_code>
<observation>"The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland."<end_observation>
Thought: I will now generate an image showcasing the oldest person.
Code:
```py
<thought>
I will now generate an image showcasing the oldest person.
<end_thought>
<code>
image = image_generator("A portrait of John Doe, a 55-year-old man living in Canada.")
final_answer(image)
```<end_action>
<end_code>
---
Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
<task>"What is the result of the following operation: 5 + 3 + 1294.678?"<end_task>
Thought: I will use python code to compute the result of the operation and then return the final answer using the `final_answer` tool
Code:
```py
<thought>
I will use python code to compute the result of the operation and then return the final answer using the `final_answer` tool
<end_thought>
<code>
result = 5 + 3 + 1294.678
final_answer(result)
```<end_action>
<end_code>
---
Task: "Which city has the highest population: Guangzhou or Shanghai?"
<task>"Which city has the highest population: Guangzhou or Shanghai?"<end_task>
Thought: I need to get the populations for both cities and compare them: I will use the tool `search` to get the population of both cities.
Code:
```py
<thought>
I need to get the populations for both cities and compare them: I will use the tool `search` to get the population of both cities.
<end_thought>
<code>
population_guangzhou = search("Guangzhou population")
print("Population Guangzhou:", population_guangzhou)
population_shanghai = search("Shanghai population")
print("Population Shanghai:", population_shanghai)
```<end_action>
Observation:
<end_code>
<observation>
Population Guangzhou: ['Guangzhou has a population of 15 million inhabitants as of 2021.']
Population Shanghai: '26 million (2019)'
<end_observation>
Thought: Now I know that Shanghai has the highest population.
Code:
```py
<thought>
Now I know that Shanghai has the highest population.
<end_thought>
<code>
final_answer("Shanghai")
```<end_action>
<end_code>
---
Task: "What is the current age of the pope, raised to the power 0.36?"
<task>"What is the current age of the pope, raised to the power 0.36?"<end_task>
Thought: I will use the tool `search` to get the age of the pope, then raise it to the power 0.36.
Code:
```py
<thought>
I will use the tool `search` to get the age of the pope, then raise it to the power 0.36.
<end_thought>
<code>
pope_age = search(query="current pope age")
print("Pope age:", pope_age)
```<end_action>
Observation:
<end_code>
<observation>
Pope age: "The pope Francis is currently 85 years old."
<end_observation>
Thought: I know that the pope is 85 years old. Let's compute the result using python code.
Code:
```py
<thought>
I know that the pope is 85 years old. Let's compute the result using python code.
<end_thought>
<code>
pope_current_age = 85 ** 0.36
final_answer(pope_current_age)
```<end_action>
<end_code>
Above examples were using notional tools that might not exist for you. You only have access to those tools:
@ -355,7 +401,7 @@ Above example were using notional tools that might not exist for you. You only h
You also can perform computations in the Python code that you generate.
Here are the rules you should always follow to solve your task:
1. Always provide a 'Thought:' sequence, and a 'Code:\n```py' sequence ending with '```<end_action>' sequence, else you will fail.
1. Always provide a '<thought>' sequence, and a '<code>' sequence ending with '<end_code>' sequence, else you will fail.
2. Use only variables that you have defined!
3. Always use the right arguments for the tools. DO NOT pass the arguments as a dict as in 'answer = ask_search_agent({'query': "What is the place where James Bond lives?"})', but use the arguments directly as in 'answer = ask_search_agent(query="What is the place where James Bond lives?")'.
4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block.
@ -510,10 +556,10 @@ How many encoder blocks were in the first attention-only ML architecture publish
```
[STEP 1 TOOL CALL]: {'tool_name': 'code interpreter', 'tool_arguments': '# Step 1: Identify the title and authors of the paper that first introduced an attention-only ML architecture.\nanswer = ask_search_agent(query="Can you find the title and authors of the paper that first introduced an attention-only machine learning architecture? Please provide the full citation.")\nprint(answer)'}
[OUTPUT OF STEP 1] Observation: **Title**: Attention Is All You Need
[OUTPUT OF STEP 1] <observation>**Title**: Attention Is All You Need
**Authors**: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin<end_observation>
[STEP 2 TOOL CALL]: {'tool_name': 'code interpreter', 'tool_arguments': '# Step 1: Find the full text of the identified paper on arXiv\\npaper_url = "https://arxiv.org/pdf/1706.03762.pdf"\\nprint(paper_url)'}
[OUTPUT OF STEP 2] Observation: https://arxiv.org/pdf/1706.03762.pdf
[OUTPUT OF STEP 2] <observation>https://arxiv.org/pdf/1706.03762.pdf<end_observation>
---
Output plan:
@ -656,10 +702,10 @@ How many encoder blocks were in the first attention-only ML architecture publish
```
[STEP 1 TOOL CALL]: {{'tool_name': 'code interpreter', 'tool_arguments': '# Step 1: Identify the title and authors of the paper that first introduced an attention-only ML architecture.\nanswer = ask_search_agent(query="Can you find the title and authors of the paper that first introduced an attention-only machine learning architecture? Please provide the full citation.")\nprint(answer)'}}
[OUTPUT OF STEP 1] Observation: **Title**: Attention Is All You Need
[OUTPUT OF STEP 1] <observation>**Title**: Attention Is All You Need
**Authors**: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin<end_observation>
[STEP 2 TOOL CALL]: {{'tool_name': 'code interpreter', 'tool_arguments': '# Step 1: Find the full text of the identified paper on arXiv\\npaper_url = "https://arxiv.org/pdf/1706.03762.pdf"\\nprint(paper_url)'}}
[OUTPUT OF STEP 2] Observation: https://arxiv.org/pdf/1706.03762.pdf
[OUTPUT OF STEP 2] <observation>https://arxiv.org/pdf/1706.03762.pdf<end_observation>
---
Output plan: