From 15fe658aca7e22782093bac2652af693b1a4c124 Mon Sep 17 00:00:00 2001 From: Rafael Uzarowski Date: Thu, 5 Jun 2025 20:56:36 +0200 Subject: [PATCH] feat: update browser_use to v2 (0.2.5) --- docker/run/fs/ins/install_playwright.sh | 7 +- prompts/default/browser_agent.system.md | 12 ++ python/tools/browser_agent.py | 204 ++++++++++++++---------- requirements.txt | 12 +- 4 files changed, 137 insertions(+), 98 deletions(-) diff --git a/docker/run/fs/ins/install_playwright.sh b/docker/run/fs/ins/install_playwright.sh index 75e16114b..153268946 100644 --- a/docker/run/fs/ins/install_playwright.sh +++ b/docker/run/fs/ins/install_playwright.sh @@ -9,10 +9,9 @@ uv pip install playwright # install chromium with dependencies # for kali-based # if [ "$@" = "hacking" ]; then - apt-get install -y fonts-unifont libnss3 libnspr4 libatk1.0-0 libatspi2.0-0 libxcomposite1 libxdamage1 libatk-bridge2.0-0 - playwright install chromium-headless-shell + apt-get install -y fonts-unifont libnss3 libnspr4 libatk1.0-0 libatspi2.0-0 libxcomposite1 libxdamage1 libatk-bridge2.0-0 libcups2 + playwright install chromium # else # # for debian based -# playwright install --with-deps chromium-headless-shell +# playwright install --with-deps chromium # fi - diff --git a/prompts/default/browser_agent.system.md b/prompts/default/browser_agent.system.md index 963930063..36e083128 100644 --- a/prompts/default/browser_agent.system.md +++ b/prompts/default/browser_agent.system.md @@ -12,6 +12,18 @@ If you are waiting for instructions: you should end the task and mark as done Your responses must always be formatted as a JSON object The response JSON must contain at least the following fields: "title", "response", "page_summary" +## Task Completion +When you have completed the assigned task OR are waiting for further instructions: +1. Use the "Complete task" action to mark the task as complete +2. Provide the required parameters: title, response, and page_summary +3. Do NOT continue taking actions after calling "Complete task" + +## Important Notes +- Always call "Complete task" when your objective is achieved +- If you navigate to a website and no further actions are requested, call "Complete task" immediately +- If you complete any requested interaction (clicking, typing, etc.), call "Complete task" +- Never leave a task running indefinitely - always conclude with "Complete task" + ## Response fields * title (type: str) - The ttitle of the current web page * response (type: str) - Your response to your superior's last request diff --git a/python/tools/browser_agent.py b/python/tools/browser_agent.py index 1ce595d56..888faf08c 100644 --- a/python/tools/browser_agent.py +++ b/python/tools/browser_agent.py @@ -1,17 +1,20 @@ import asyncio import json import time +from typing import Optional from agent import Agent, InterventionException import models from python.helpers.tool import Tool, Response from python.helpers import files, defer, persist_chat, strings from python.helpers.browser_use import browser_use +from python.helpers.print_style import PrintStyle + from python.extensions.message_loop_start._10_iteration_no import get_iter_no from pydantic import BaseModel import uuid from python.helpers.dirty_json import DirtyJson -from langchain_core.messages import SystemMessage + class State: @staticmethod @@ -21,49 +24,47 @@ class State: def __init__(self, agent: Agent): self.agent = agent - self.context = None - self.task = None - self.use_agent = None - self.browser = None + self.browser_session: Optional[browser_use.BrowserSession] = None + self.task: Optional[defer.DeferredTask] = None + self.use_agent: Optional[browser_use.Agent] = None self.iter_no = 0 - def __del__(self): self.kill_task() async def _initialize(self): - if self.context: + if self.browser_session: return - self.browser = browser_use.Browser( - config=browser_use.BrowserConfig( + self.browser_session = browser_use.BrowserSession( + browser_profile=browser_use.BrowserProfile( headless=True, disable_security=True, + chromium_sandbox=False, + minimum_wait_page_load_time=1.0, + wait_for_network_idle_page_load_time=2.0, + maximum_wait_page_load_time=10.0, + args=['--headless=new'], ) ) - # Await the coroutine to get the browser context - self.context = await self.browser.new_context() - - # override async methods to create hooks + await self.browser_session.start() self.override_hooks() - # Add init script to the context - this will be applied to all new pages - await self.context._initialize_session() - pw_context = self.context.session.context # type: ignore - js_override = files.get_abs_path("lib/browser/init_override.js") - await pw_context.add_init_script(path=js_override) # type: ignore + # Add init script to the browser session + if self.browser_session.browser_context: + js_override = files.get_abs_path("lib/browser/init_override.js") + await self.browser_session.browser_context.add_init_script(path=js_override) def start_task(self, task: str): if self.task and self.task.is_alive(): self.kill_task() - if not self.task: - self.task = defer.DeferredTask( - thread_name="BrowserAgent" + self.agent.context.id - ) - if self.agent.context.task: - self.agent.context.task.add_child_task(self.task, terminate_thread=True) + self.task = defer.DeferredTask( + thread_name="BrowserAgent" + self.agent.context.id + ) + if self.agent.context.task: + self.agent.context.task.add_child_task(self.task, terminate_thread=True) self.task.start_task(self._run_task, task) return self.task @@ -71,24 +72,23 @@ class State: if self.task: self.task.kill(terminate_thread=True) self.task = None - self.context = None - self.use_agent = None - self.browser = None - self.iter_no = 0 + if self.browser_session: + try: + import asyncio + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + loop.run_until_complete(self.browser_session.close()) + loop.close() + except Exception as e: + PrintStyle().error(f"Error closing browser session: {e}") + finally: + self.browser_session = None + self.use_agent = None + self.iter_no = 0 async def _run_task(self, task: str): - - agent = self.agent - await self._initialize() - class CustomSystemPrompt(browser_use.SystemPrompt): - def get_system_message(self) -> SystemMessage: - existing_rules = super().get_system_message().text() - new_rules = agent.read_prompt("prompts/browser_agent.system.md") - return SystemMessage(content=f"{existing_rules}\n{new_rules}".strip()) - - # Model of task result class DoneResult(BaseModel): title: str response: str @@ -97,18 +97,16 @@ class State: # Initialize controller controller = browser_use.Controller() - # we overwrite done() in this example to demonstrate the validator - @controller.registry.action("Done with task", param_model=DoneResult) - async def done(params: DoneResult): + # Register custom completion action with proper ActionResult fields + @controller.registry.action("Complete task", param_model=DoneResult) + async def complete_task(params: DoneResult): result = browser_use.ActionResult( - is_done=True, extracted_content=params.model_dump_json() + is_done=True, + success=True, + extracted_content=params.model_dump_json() ) return result - # @controller.action("Ask user for information") - # def ask_user(question: str) -> str: - # return "..." - model = models.get_model( type=models.ModelType.CHAT, provider=self.agent.config.browser_model.provider, @@ -118,27 +116,29 @@ class State: self.use_agent = browser_use.Agent( task=task, - browser_context=self.context, + browser_session=self.browser_session, llm=model, use_vision=self.agent.config.browser_model.vision, - system_prompt_class=CustomSystemPrompt, + extend_system_message=self.agent.read_prompt("prompts/browser_agent.system.md"), controller=controller, + enable_memory=False, # Disable memory to avoid state conflicts ) self.iter_no = get_iter_no(self.agent) - # orig_err_hnd = self.use_agent._handle_step_error - # def new_err_hnd(*args, **kwargs): - # if isinstance(args[0], InterventionException): - # raise args[0] - # return orig_err_hnd(*args, **kwargs) - # self.use_agent._handle_step_error = new_err_hnd - - result = await self.use_agent.run() - return result + try: + result = await self.use_agent.run(max_steps=50) + return result + finally: + if self.browser_session: + try: + await self.browser_session.close() + except Exception as e: + PrintStyle().error(f"Error closing browser session in task cleanup: {e}") + finally: + self.browser_session = None def override_hooks(self): - # override async function to create a hook def override_hook(func): async def wrapper(*args, **kwargs): await self.agent.wait_if_paused() @@ -147,14 +147,24 @@ class State: return await func(*args, **kwargs) return wrapper - if self.context: - self.context.get_state = override_hook(self.context.get_state) - self.context.get_session = override_hook(self.context.get_session) - self.context.remove_highlights = override_hook(self.context.remove_highlights) + if self.browser_session and hasattr(self.browser_session, 'remove_highlights'): + self.browser_session.remove_highlights = override_hook(self.browser_session.remove_highlights) async def get_page(self): + if self.use_agent and self.browser_session: + try: + return await self.use_agent.browser_session.get_current_page() + except Exception: + # Browser session might be closed or invalid + return None + return None + + async def get_selector_map(self): + """Get the selector map for the current page state.""" if self.use_agent: - return await self.use_agent.browser_context.get_current_page() + await self.use_agent.browser_session.get_state_summary(cache_clickable_elements_hashes=True) + return await self.use_agent.browser_session.get_selector_map() + return {} class BrowserAgent(Tool): @@ -165,8 +175,17 @@ class BrowserAgent(Tool): await self.prepare_state(reset=reset) task = self.state.start_task(message) - # wait for browser agent to finish and update progress + # wait for browser agent to finish and update progress with timeout + timeout_seconds = 300 # 5 minute timeout + start_time = time.time() + while not task.is_ready(): + # Check for timeout to prevent infinite waiting + if time.time() - start_time > timeout_seconds: + PrintStyle().warning(f"Browser agent task timeout after {timeout_seconds} seconds, forcing completion") + self.state.kill_task() + break + await self.agent.handle_intervention() await asyncio.sleep(1) try: @@ -177,22 +196,43 @@ class BrowserAgent(Tool): screenshot = update.get("screenshot", None) if screenshot: self.log.update(screenshot=screenshot) - except Exception as e: + except Exception: pass - # collect result - result = await task.result() - answer = result.final_result() + # collect result with error handling try: - if answer and isinstance(answer, str) and answer.strip(): - answer_data = DirtyJson.parse_string(answer) - answer_text = strings.dict_to_text(answer_data) # type: ignore - else: - answer_text = str(answer) if answer else "No result returned" + result = await task.result() + PrintStyle().debug(f"Browser agent task completed, is_done: {result['is_done']}") except Exception as e: - answer_text = str(answer) if answer else f"Error processing result: {str(e)}" + PrintStyle().error(f"Error getting browser agent task result: {str(e)}") + # Return a timeout response if task.result() fails + answer_text = f"Browser agent task failed to return result: {str(e)}" + self.log.update(answer=answer_text) + return Response(message=answer_text, break_loop=False) + finally: + # Stop any further browser access after task completion + self.state.kill_task() + + # Check if task completed successfully + if result['is_done']: + answer = result['final_result'] + try: + if answer and isinstance(answer, str) and answer.strip(): + answer_data = DirtyJson.parse_string(answer) + answer_text = strings.dict_to_text(answer_data) # type: ignore + else: + answer_text = str(answer) if answer else "Task completed successfully" + except Exception as e: + answer_text = str(answer) if answer else f"Task completed with parse error: {str(e)}" + else: + # Task hit max_steps without calling done() + urls = result['urls'] + current_url = urls[-1] if urls else "unknown" + answer_text = (f"Task reached step limit without completion. Last page: {current_url}. " + f"The browser agent may need clearer instructions on when to finish.") + self.log.update(answer=answer_text) - return Response(message=answer, break_loop=False) + return Response(message=answer_text, break_loop=False) def get_log_object(self): return self.agent.context.log.log( @@ -202,9 +242,6 @@ class BrowserAgent(Tool): kvps=self.args, ) - # async def after_execution(self, response, **kwargs): - # await self.agent.hist_add_tool_result(self.name, response.message) - async def get_update(self): await self.prepare_state() @@ -212,7 +249,6 @@ class BrowserAgent(Tool): agent = self.agent ua = self.state.use_agent page = await self.state.get_page() - ctx = self.state.context if ua and page: try: @@ -223,14 +259,6 @@ class BrowserAgent(Tool): log = [] - # dom_service = browser_use.DomService(page) - # dom_state = await browser_use.utils.time_execution_sync('get_clickable_elements')( - # dom_service.get_clickable_elements - # )() - # elements = dom_state.element_tree - # selector_map = dom_state.selector_map - # el_text = elements.clickable_elements_to_string() - for message in ua.message_manager.get_messages(): if message.type == "system": continue @@ -265,7 +293,7 @@ class BrowserAgent(Tool): if self.state.task: await self.state.task.execute_inside(_get_update) - except Exception as e: + except Exception: pass return result diff --git a/requirements.txt b/requirements.txt index 91ee3b7d9..1b3017f41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ a2wsgi==1.10.8 ansio==0.0.1 -beautifulsoup4==4.12.3 -browser-use==0.1.40 +beautifulsoup4==4.13.4 +browser-use==0.2.5 docker==7.1.0 duckduckgo-search==6.1.12 -faiss-cpu==1.8.0.post1 +faiss-cpu==1.11.0 fastmcp==2.3.4 flask[async]==3.0.3 flask-basicauth==0.2.0 @@ -13,12 +13,12 @@ GitPython==3.1.43 inputimeout==1.0.4 langchain-anthropic==0.3.3 langchain-community==0.3.19 -langchain-google-genai==2.0.8 +langchain-google-genai==2.1.2 langchain-groq==0.2.2 langchain-huggingface==0.1.2 langchain-mistralai==0.2.4 -langchain-ollama==0.2.2 -langchain-openai==0.3.1 +langchain-ollama==0.3.0 +langchain-openai==0.3.11 openai-whisper==20240930 lxml_html_clean==0.3.1 markdown==3.7