diff --git a/skyvern/config.py b/skyvern/config.py index d0a4ed5e..e89fdc5c 100644 --- a/skyvern/config.py +++ b/skyvern/config.py @@ -15,6 +15,7 @@ class Settings(BaseSettings): BROWSER_ACTION_TIMEOUT_MS: int = 5000 BROWSER_SCREENSHOT_TIMEOUT_MS: int = 20000 BROWSER_LOADING_TIMEOUT_MS: int = 120000 + OPTION_LOADING_TIMEOUT_MS: int = 600000 MAX_STEPS_PER_RUN: int = 75 MAX_NUM_SCREENSHOTS: int = 10 # Ratio should be between 0 and 1. @@ -73,6 +74,7 @@ class Settings(BaseSettings): ##################### # ACTIVE LLM PROVIDER LLM_KEY: str = "OPENAI_GPT4O" + SECONDARY_LLM_KEY: str | None = None # COMMON LLM_CONFIG_TIMEOUT: int = 300 LLM_CONFIG_MAX_TOKENS: int = 4096 diff --git a/skyvern/exceptions.py b/skyvern/exceptions.py index 963c4ab2..bc8e9161 100644 --- a/skyvern/exceptions.py +++ b/skyvern/exceptions.py @@ -443,3 +443,29 @@ class WrongElementToUploadFile(SkyvernException): class FailedToFetchSecret(SkyvernException): def __init__(self) -> None: super().__init__("Failed to get the actual value of the secret parameter") + + +class NoIncrementalElementFoundForCustomSelection(SkyvernException): + def __init__(self, element_id: str) -> None: + super().__init__( + f"No incremental element found, maybe try an input action or taking the select action on other elements. element_id={element_id}" + ) + + +class NoLabelOrValueForCustomSelection(SkyvernException): + def __init__(self, element_id: str) -> None: + super().__init__( + f"This is a custom selection, there must be invalid text for option.label or option.value. element_id={element_id}" + ) + + +class NoElementMatchedForTargetOption(SkyvernException): + def __init__(self, target: str, reason: str | None) -> None: + super().__init__( + f"No element matches for the target value, try another value. reason: {reason}. target_value='{target}'." + ) + + +class NoElementBoudingBox(SkyvernException): + def __init__(self, element_id: str) -> None: + super().__init__(f"Element does not have a bounding box. 
element_id={element_id}") diff --git a/skyvern/forge/app.py b/skyvern/forge/app.py index 18a6d1ee..b1480156 100644 --- a/skyvern/forge/app.py +++ b/skyvern/forge/app.py @@ -26,6 +26,9 @@ ARTIFACT_MANAGER = ArtifactManager() BROWSER_MANAGER = BrowserManager() EXPERIMENTATION_PROVIDER: BaseExperimentationProvider = NoOpExperimentationProvider() LLM_API_HANDLER = LLMAPIHandlerFactory.get_llm_api_handler(SettingsManager.get_settings().LLM_KEY) +SECONDARY_LLM_API_HANDLER = LLMAPIHandlerFactory.get_llm_api_handler( + SETTINGS_MANAGER.SECONDARY_LLM_KEY if SETTINGS_MANAGER.SECONDARY_LLM_KEY else SETTINGS_MANAGER.LLM_KEY +) WORKFLOW_CONTEXT_MANAGER = WorkflowContextManager() WORKFLOW_SERVICE = WorkflowService() AGENT_FUNCTION = AgentFunction() diff --git a/skyvern/forge/prompts/skyvern/custom-select.j2 b/skyvern/forge/prompts/skyvern/custom-select.j2 new file mode 100644 index 00000000..fc611f75 --- /dev/null +++ b/skyvern/forge/prompts/skyvern/custom-select.j2 @@ -0,0 +1,25 @@ +You are performing a select action on an HTML page. Help to click the best-matching element for the target value among the HTML elements based on the context. +MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. +Each interactable element is tagged with an ID. + +Reply in JSON format with the following keys: +{ + "reasoning": str, // The reasoning behind the action. Be specific, referencing target value and element ids in your reasoning. Mention why you chose the element id. Keep the reasoning short and to the point. + "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence + "id": str, // The id of the element to take action on. 
The id has to be one from the elements list +} + +Context: +``` +{{ context_reasoning }} +``` + +Target value: +``` +{{ target_value }} +``` + +HTML elements: +``` +{{ elements }} +``` \ No newline at end of file diff --git a/skyvern/forge/prompts/skyvern/opened-dropdown-confirm.j2 b/skyvern/forge/prompts/skyvern/opened-dropdown-confirm.j2 new file mode 100644 index 00000000..6bd85a3b --- /dev/null +++ b/skyvern/forge/prompts/skyvern/opened-dropdown-confirm.j2 @@ -0,0 +1,7 @@ +This is a screenshot of part of a web HTML page. Help me confirm whether it's an opened dropdown menu. +MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. + +Reply in JSON format with the following keys: +{ + "is_opened_dropdown_menu": bool, // true if it's an opened dropdown menu, otherwise false. +} \ No newline at end of file diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index 96d714d1..64800a23 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -3,7 +3,7 @@ import json import os import urllib.parse import uuid -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from typing import Any, Awaitable, Callable, List import structlog @@ -20,12 +20,13 @@ from skyvern.exceptions import ( FailToSelectByLabel, FailToSelectByValue, ImaginaryFileUrl, - InputActionOnSelect2Dropdown, InvalidElementForTextInput, MissingElement, MissingFileUrl, MultipleElementsFound, - NoSelectableElementFound, + NoElementMatchedForTargetOption, + NoIncrementalElementFoundForCustomSelection, + NoLabelOrValueForCustomSelection, OptionIndexOutOfBound, WrongElementToUploadFile, ) @@ -36,6 +37,7 @@ from skyvern.forge.sdk.api.files import ( get_number_of_files_in_directory, get_path_for_workflow_download_directory, ) +from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandler from skyvern.forge.sdk.core.aiohttp_helper 
import aiohttp_post from skyvern.forge.sdk.core.security import generate_skyvern_signature from skyvern.forge.sdk.db.enums import OrganizationAuthTokenType @@ -56,8 +58,8 @@ from skyvern.webeye.actions.actions import ( ) from skyvern.webeye.actions.responses import ActionFailure, ActionResult, ActionSuccess from skyvern.webeye.browser_factory import BrowserState, get_download_dir -from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage -from skyvern.webeye.utils.dom import AbstractSelectDropdown, DomUtil, SkyvernElement +from skyvern.webeye.scraper.scraper import ElementTreeFormat, IncrementalScrapePage, ScrapedPage +from skyvern.webeye.utils.dom import DomUtil, InteractiveElement, SkyvernElement from skyvern.webeye.utils.page import SkyvernFrame LOG = structlog.get_logger() @@ -286,8 +288,6 @@ async def handle_input_text_action( ) -> list[ActionResult]: dom = DomUtil(scraped_page, page) skyvern_element = await dom.get_skyvern_element_by_id(action.element_id) - if await skyvern_element.is_select2_dropdown(): - return [ActionFailure(InputActionOnSelect2Dropdown(element_id=action.element_id))] current_text = await get_input_value(skyvern_element.get_tag_name(), skyvern_element.get_locator()) if current_text == action.text: @@ -469,122 +469,21 @@ async def handle_select_option_action( ) return [ActionFailure(ErrFoundSelectableElement(action.element_id, e))] - if selectable_child is None: - LOG.error( - "No selectable element found in chidren", - tag_name=tag_name, - action=action, - ) - return [ActionFailure(NoSelectableElementFound(action.element_id))] - - LOG.info( - "Found selectable element in the children", - tag_name=selectable_child.get_tag_name(), - element_id=selectable_child.get_id(), - ) - select_action = SelectOptionAction(element_id=selectable_child.get_id(), option=action.option) - return await handle_select_option_action(select_action, page, scraped_page, task, step) - - select_framework: AbstractSelectDropdown | None = None - - if 
await skyvern_element.is_combobox_dropdown(): - LOG.info( - "This is a combobox dropdown", - action=action, - ) - select_framework = await skyvern_element.get_combobox_dropdown() - if await skyvern_element.is_select2_dropdown(): - LOG.info( - "This is a select2 dropdown", - action=action, - ) - select_framework = await skyvern_element.get_select2_dropdown() - if await skyvern_element.is_react_select_dropdown(): - LOG.info( - "This is a react select dropdown", - action=action, - ) - select_framework = await skyvern_element.get_react_select_dropdown() - - if select_framework is not None: - timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS - - try: - current_value = await select_framework.get_current_value() - if current_value == action.option.label or current_value == action.option.value: - return [ActionSuccess()] - except Exception: + if selectable_child: LOG.info( - "failed to confirm if the select option has been done, force to take the action again.", - exc_info=True, + "Found selectable element in the children", + tag_name=selectable_child.get_tag_name(), + element_id=selectable_child.get_id(), ) + select_action = SelectOptionAction(element_id=selectable_child.get_id(), option=action.option) + return await handle_select_option_action(select_action, page, scraped_page, task, step) - await select_framework.open() - options = await select_framework.get_options() - - result: List[ActionResult] = [] - # select by label first, then by index - if action.option.label is not None or action.option.value is not None: - try: - for option in options: - option_content = option.get("text") - option_index = option.get("optionIndex", None) - if option_index is None: - LOG.warning( - f"{select_framework.name()} option index is None", - option=option, - ) - continue - if action.option.label == option_content or action.option.value == option_content: - await select_framework.select_by_index(index=option_index, timeout=timeout) - result.append(ActionSuccess()) - 
return result - LOG.info( - f"no target {select_framework.name()} option matched by label, try to select by index", - action=action, - ) - except Exception as e: - result.append(ActionFailure(e)) - LOG.info( - f"failed to select by label in {select_framework.name()}, try to select by index", - exc_info=True, - action=action, - ) - - if action.option.index is not None: - if action.option.index >= len(options): - result.append(ActionFailure(OptionIndexOutOfBound(action.element_id))) - else: - try: - option_content = options[action.option.index].get("text") - if option_content != action.option.label: - LOG.warning( - "Select option label is not consistant to the action value. Might select wrong option.", - option_content=option_content, - action=action, - ) - await select_framework.select_by_index(index=action.option.index, timeout=timeout) - result.append(ActionSuccess()) - return result - except Exception: - result.append(ActionFailure(FailToSelectByIndex(action.element_id))) - LOG.info( - f"failed to select by index in {select_framework.name()}", - exc_info=True, - action=action, - ) - - if len(result) == 0: - result.append(ActionFailure(EmptySelect(action.element_id))) - - if isinstance(result[-1], ActionFailure): - LOG.info( - f"Failed to select a {select_framework.name()} option, close the dropdown", - action=action, - ) - await select_framework.close() - - return result + if tag_name == InteractiveElement.SELECT: + LOG.info( + "SelectOptionAction is on