general selection (#675)

This commit is contained in:
LawyZheng 2024-08-06 13:30:52 +08:00 committed by GitHub
parent 845ae8d3e4
commit cba0f68a5e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 655 additions and 241 deletions

View file

@ -15,6 +15,7 @@ class Settings(BaseSettings):
BROWSER_ACTION_TIMEOUT_MS: int = 5000
BROWSER_SCREENSHOT_TIMEOUT_MS: int = 20000
BROWSER_LOADING_TIMEOUT_MS: int = 120000
OPTION_LOADING_TIMEOUT_MS: int = 600000
MAX_STEPS_PER_RUN: int = 75
MAX_NUM_SCREENSHOTS: int = 10
# Ratio should be between 0 and 1.
@ -73,6 +74,7 @@ class Settings(BaseSettings):
#####################
# ACTIVE LLM PROVIDER
LLM_KEY: str = "OPENAI_GPT4O"
SECONDARY_LLM_KEY: str | None = None
# COMMON
LLM_CONFIG_TIMEOUT: int = 300
LLM_CONFIG_MAX_TOKENS: int = 4096

View file

@ -443,3 +443,29 @@ class WrongElementToUploadFile(SkyvernException):
class FailedToFetchSecret(SkyvernException):
    """Error reporting that the actual value of a secret parameter could not be obtained."""

    def __init__(self) -> None:
        message = "Failed to get the actual value of the secret parameter"
        super().__init__(message)
class NoIncrementalElementFoundForCustomSelection(SkyvernException):
    """Error reporting that a custom selection produced no incremental element.

    Args:
        element_id: id of the element the select action was taken on.
    """

    def __init__(self, element_id: str) -> None:
        message = f"No incremental element found, maybe try an input action or taking the select action on other elements. element_id={element_id}"
        super().__init__(message)
class NoLabelOrValueForCustomSelection(SkyvernException):
    """Error reporting that a custom selection has no usable option label or value.

    Args:
        element_id: id of the element the select action was taken on.
    """

    def __init__(self, element_id: str) -> None:
        message = f"This is a custom selection, there must be invalid text for option.label or option.value. element_id={element_id}"
        super().__init__(message)
class NoElementMatchedForTargetOption(SkyvernException):
    """Error reporting that no element matched the requested target value.

    Args:
        target: the target value that was searched for.
        reason: optional explanation of why nothing matched; rendered as-is
            (``None`` included) into the message.
    """

    def __init__(self, target: str, reason: str | None) -> None:
        message = f"No element matches for the target value, try another value. reason: {reason}. target_value='{target}'."
        super().__init__(message)
class NoElementBoudingBox(SkyvernException):
    """Error reporting that an element has no bounding box.

    Args:
        element_id: id of the element without a bounding box.
    """

    def __init__(self, element_id: str) -> None:
        message = f"Element does not have a bounding box. element_id={element_id}"
        super().__init__(message)

View file

@ -26,6 +26,9 @@ ARTIFACT_MANAGER = ArtifactManager()
BROWSER_MANAGER = BrowserManager()
EXPERIMENTATION_PROVIDER: BaseExperimentationProvider = NoOpExperimentationProvider()
LLM_API_HANDLER = LLMAPIHandlerFactory.get_llm_api_handler(SettingsManager.get_settings().LLM_KEY)
SECONDARY_LLM_API_HANDLER = LLMAPIHandlerFactory.get_llm_api_handler(
SETTINGS_MANAGER.SECONDARY_LLM_KEY if SETTINGS_MANAGER.SECONDARY_LLM_KEY else SETTINGS_MANAGER.LLM_KEY
)
WORKFLOW_CONTEXT_MANAGER = WorkflowContextManager()
WORKFLOW_SERVICE = WorkflowService()
AGENT_FUNCTION = AgentFunction()

View file

@ -0,0 +1,25 @@
You are performing a select action on an HTML page. Help click the element that best matches the target value among the HTML elements, based on the context.
MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc.
Each interactable element is tagged with an ID.
Reply in JSON format with the following keys:
{
"reasoning": str, // The reasoning behind the action. Be specific, referencing target value and element ids in your reasoning. Mention why you chose the element id. Keep the reasoning short and to the point.
"confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
"id": str, // The id of the element to take action on. The id has to be one from the elements list
}
Context:
```
{{ context_reasoning }}
```
Target value:
```
{{ target_value }}
```
HTML elements:
```
{{ elements }}
```

View file

@ -0,0 +1,7 @@
This is a screenshot of part of an HTML web page. Help me confirm whether it shows an opened dropdown menu.
MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc.
Reply in JSON format with the following keys:
{
"is_opened_dropdown_menu": bool, // true if it's an opened dropdown menu, otherwise false.
}

View file

@ -3,7 +3,7 @@ import json
import os
import urllib.parse
import uuid
from datetime import datetime, timedelta
from datetime import datetime, timedelta, timezone
from typing import Any, Awaitable, Callable, List
import structlog
@ -20,12 +20,13 @@ from skyvern.exceptions import (
FailToSelectByLabel,
FailToSelectByValue,
ImaginaryFileUrl,
InputActionOnSelect2Dropdown,
InvalidElementForTextInput,
MissingElement,
MissingFileUrl,
MultipleElementsFound,
NoSelectableElementFound,
NoElementMatchedForTargetOption,
NoIncrementalElementFoundForCustomSelection,
NoLabelOrValueForCustomSelection,
OptionIndexOutOfBound,
WrongElementToUploadFile,
)
@ -36,6 +37,7 @@ from skyvern.forge.sdk.api.files import (
get_number_of_files_in_directory,
get_path_for_workflow_download_directory,
)
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandler
from skyvern.forge.sdk.core.aiohttp_helper import aiohttp_post
from skyvern.forge.sdk.core.security import generate_skyvern_signature
from skyvern.forge.sdk.db.enums import OrganizationAuthTokenType
@ -56,8 +58,8 @@ from skyvern.webeye.actions.actions import (
)
from skyvern.webeye.actions.responses import ActionFailure, ActionResult, ActionSuccess
from skyvern.webeye.browser_factory import BrowserState, get_download_dir
from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage
from skyvern.webeye.utils.dom import AbstractSelectDropdown, DomUtil, SkyvernElement
from skyvern.webeye.scraper.scraper import ElementTreeFormat, IncrementalScrapePage, ScrapedPage
from skyvern.webeye.utils.dom import DomUtil, InteractiveElement, SkyvernElement
from skyvern.webeye.utils.page import SkyvernFrame
LOG = structlog.get_logger()
@ -286,8 +288,6 @@ async def handle_input_text_action(
) -> list[ActionResult]:
dom = DomUtil(scraped_page, page)
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
if await skyvern_element.is_select2_dropdown():
return [ActionFailure(InputActionOnSelect2Dropdown(element_id=action.element_id))]
current_text = await get_input_value(skyvern_element.get_tag_name(), skyvern_element.get_locator())
if current_text == action.text:
@ -469,14 +469,7 @@ async def handle_select_option_action(
)
return [ActionFailure(ErrFoundSelectableElement(action.element_id, e))]
if selectable_child is None:
LOG.error(
"No selectable element found in chidren",
tag_name=tag_name,
action=action,
)
return [ActionFailure(NoSelectableElementFound(action.element_id))]
if selectable_child:
LOG.info(
"Found selectable element in the children",
tag_name=selectable_child.get_tag_name(),
@ -485,106 +478,12 @@ async def handle_select_option_action(
select_action = SelectOptionAction(element_id=selectable_child.get_id(), option=action.option)
return await handle_select_option_action(select_action, page, scraped_page, task, step)
select_framework: AbstractSelectDropdown | None = None
if await skyvern_element.is_combobox_dropdown():
if tag_name == InteractiveElement.SELECT:
LOG.info(
"This is a combobox dropdown",
"SelectOptionAction is on <select>",
action=action,
)
select_framework = await skyvern_element.get_combobox_dropdown()
if await skyvern_element.is_select2_dropdown():
LOG.info(
"This is a select2 dropdown",
action=action,
)
select_framework = await skyvern_element.get_select2_dropdown()
if await skyvern_element.is_react_select_dropdown():
LOG.info(
"This is a react select dropdown",
action=action,
)
select_framework = await skyvern_element.get_react_select_dropdown()
if select_framework is not None:
timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
try:
current_value = await select_framework.get_current_value()
if current_value == action.option.label or current_value == action.option.value:
return [ActionSuccess()]
except Exception:
LOG.info(
"failed to confirm if the select option has been done, force to take the action again.",
exc_info=True,
)
await select_framework.open()
options = await select_framework.get_options()
result: List[ActionResult] = []
# select by label first, then by index
if action.option.label is not None or action.option.value is not None:
try:
for option in options:
option_content = option.get("text")
option_index = option.get("optionIndex", None)
if option_index is None:
LOG.warning(
f"{select_framework.name()} option index is None",
option=option,
)
continue
if action.option.label == option_content or action.option.value == option_content:
await select_framework.select_by_index(index=option_index, timeout=timeout)
result.append(ActionSuccess())
return result
LOG.info(
f"no target {select_framework.name()} option matched by label, try to select by index",
action=action,
)
except Exception as e:
result.append(ActionFailure(e))
LOG.info(
f"failed to select by label in {select_framework.name()}, try to select by index",
exc_info=True,
action=action,
)
if action.option.index is not None:
if action.option.index >= len(options):
result.append(ActionFailure(OptionIndexOutOfBound(action.element_id)))
else:
try:
option_content = options[action.option.index].get("text")
if option_content != action.option.label:
LOG.warning(
"Select option label is not consistant to the action value. Might select wrong option.",
option_content=option_content,
action=action,
)
await select_framework.select_by_index(index=action.option.index, timeout=timeout)
result.append(ActionSuccess())
return result
except Exception:
result.append(ActionFailure(FailToSelectByIndex(action.element_id)))
LOG.info(
f"failed to select by index in {select_framework.name()}",
exc_info=True,
action=action,
)
if len(result) == 0:
result.append(ActionFailure(EmptySelect(action.element_id)))
if isinstance(result[-1], ActionFailure):
LOG.info(
f"Failed to select a {select_framework.name()} option, close the dropdown",
action=action,
)
await select_framework.close()
return result
return await normal_select(action=action, skyvern_element=skyvern_element)
if await skyvern_element.is_checkbox():
LOG.info(
@ -602,7 +501,99 @@ async def handle_select_option_action(
click_action = ClickAction(element_id=action.element_id)
return await chain_click(task, scraped_page, page, click_action, skyvern_element)
return await normal_select(action=action, skyvern_element=skyvern_element)
LOG.info(
"Trigger custom select",
action=action,
)
timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
skyvern_frame = await SkyvernFrame.create_instance(skyvern_element.get_frame())
incremental_scraped = IncrementalScrapePage(skyvern_frame=skyvern_frame)
llm_handler = app.SECONDARY_LLM_API_HANDLER
is_open = False
try:
await incremental_scraped.start_listen_dom_increment()
await skyvern_element.get_locator().focus(timeout=timeout)
if tag_name == InteractiveElement.INPUT:
await skyvern_element.get_locator().press("ArrowDown", timeout=timeout)
else:
await skyvern_element.get_locator().click(timeout=timeout)
# wait 5s for options to load
await asyncio.sleep(5)
is_open = True
incremental_element = await incremental_scraped.get_incremental_element_tree(
app.AGENT_FUNCTION.cleanup_element_tree
)
if len(incremental_element) == 0:
raise NoIncrementalElementFoundForCustomSelection(element_id=action.element_id)
dropdown_menu_element = await locate_dropdown_meanu(
incremental_scraped=incremental_scraped,
element_trees=incremental_element,
llm_handler=llm_handler,
step=step,
task=task,
)
if dropdown_menu_element and dropdown_menu_element.get_scrollable():
await scroll_down_to_load_all_options(
dropdown_menu_element=dropdown_menu_element,
skyvern_frame=skyvern_frame,
page=page,
incremental_scraped=incremental_scraped,
step=step,
task=task,
)
await incremental_scraped.get_incremental_element_tree(app.AGENT_FUNCTION.cleanup_element_tree)
# TODO: maybe take a screenshot for every tree head element to figure out which is the dropdown menu
html = incremental_scraped.build_html_tree()
target_value = action.option.label or action.option.value
if target_value is None:
raise NoLabelOrValueForCustomSelection(element_id=action.element_id)
prompt = prompt_engine.load_prompt(
"custom-select", context_reasoning=action.reasoning, target_value=target_value, elements=html
)
LOG.info(
"Calling LLM to find the match element",
target_value=target_value,
step_id=step.step_id,
task_id=task.task_id,
)
json_response = await llm_handler(prompt=prompt, step=step)
LOG.info(
"LLM response for the matched element",
target_value=target_value,
response=json_response,
step_id=step.step_id,
task_id=task.task_id,
)
element_id: str | None = json_response.get("id", None)
if not element_id:
raise NoElementMatchedForTargetOption(target=target_value, reason=json_response.get("reasoning"))
selected_element = await SkyvernElement.create_from_incremental(incremental_scraped, element_id)
await selected_element.scroll_into_view()
await selected_element.get_locator().click(timeout=timeout)
return [ActionSuccess()]
except Exception as e:
if is_open:
await skyvern_element.scroll_into_view()
await skyvern_element.coordinate_click(page=page)
await skyvern_element.get_locator().press("Escape", timeout=timeout)
LOG.exception("custome select error")
return [ActionFailure(exception=e)]
finally:
await incremental_scraped.stop_listen_dom_increment()
async def handle_checkbox_action(
@ -836,6 +827,113 @@ async def chain_click(
return [ActionFailure(WrongElementToUploadFile(action.element_id))]
async def locate_dropdown_meanu(
    incremental_scraped: IncrementalScrapePage,
    element_trees: list[dict],
    llm_handler: LLMAPIHandler,
    step: Step | None = None,
    task: Task | None = None,
    max_candidates: int = 10,
) -> SkyvernElement | None:
    """Find the first tree-head element that the LLM confirms is an opened dropdown menu.

    Each candidate is screenshotted and sent to the LLM together with the
    "opened-dropdown-confirm" prompt; the first one whose response has a truthy
    ``is_opened_dropdown_menu`` is returned.

    Args:
        incremental_scraped: incremental scrape used to resolve element ids to elements.
        element_trees: candidate tree-head element dicts, inspected in order.
        llm_handler: LLM callable used for the screenshot confirmation.
        step: optional step, forwarded to the LLM call and used for log context.
        task: optional task, used for log context only.
        max_candidates: only the first ``max_candidates`` entries of
            ``element_trees`` are considered, to bound the number of LLM requests.

    Returns:
        The confirmed dropdown menu element, or ``None`` when no candidate is confirmed.
    """
    step_id = step.step_id if step else "none"
    task_id = task.task_id if task else "none"
    # Loop-invariant values are resolved once instead of once per candidate.
    screenshot_timeout = SettingsManager.get_settings().BROWSER_SCREENSHOT_TIMEOUT_MS
    dropdown_confirm_prompt = prompt_engine.load_prompt("opened-dropdown-confirm")

    # FIXME: cap the number of inspected nodes for now, preventing sending too many requests to LLM
    for element_dict in element_trees[: max(max_candidates, 0)]:
        element_id = element_dict.get("id")
        if not element_id:
            LOG.info(
                "Skip the non-interactable element for the dropdown menu confirm",
                step_id=step_id,
                task_id=task_id,
                element=element_dict,
            )
            continue

        head_element = await SkyvernElement.create_from_incremental(incremental_scraped, element_id)
        screenshot = await head_element.get_locator().screenshot(timeout=screenshot_timeout)
        LOG.info(
            "Confirm if it's an opened dropdown menu",
            step_id=step_id,
            task_id=task_id,
            element=element_dict,
        )
        json_response = await llm_handler(prompt=dropdown_confirm_prompt, screenshots=[screenshot], step=step)
        if json_response.get("is_opened_dropdown_menu"):
            # Reuse the element already resolved for the screenshot rather than
            # looking it up in the page a second time.
            return head_element

    return None
async def scroll_down_to_load_all_options(
    dropdown_menu_element: SkyvernElement,
    page: Page,
    skyvern_frame: SkyvernFrame,
    incremental_scraped: IncrementalScrapePage,
    step: Step | None = None,
    task: Task | None = None,
) -> None:
    """Scroll a dropdown menu downwards until no new incremental elements show up.

    The menu is scrolled repeatedly (through its element handle when available,
    otherwise with the mouse wheel), waiting 5 seconds after each scroll. The
    loop ends when the incremental element count stops changing or when
    OPTION_LOADING_TIMEOUT_MS has elapsed. Afterwards the menu is scrolled back
    to its starting position.
    """
    step_id = step.step_id if step else "none"
    task_id = task.task_id if task else "none"
    LOG.info(
        "Scroll down the dropdown menu to load all options",
        step_id=step_id,
        task_id=task_id,
    )
    settings = SettingsManager.get_settings()
    timeout = settings.BROWSER_ACTION_TIMEOUT_MS

    menu_handle = await dropdown_menu_element.get_locator().element_handle(timeout=timeout)
    if menu_handle is not None:
        await menu_handle.scroll_into_view_if_needed(timeout=timeout)
    else:
        LOG.info("element handle is None, using focus to move the cursor", element_id=dropdown_menu_element.get_id())
        await dropdown_menu_element.get_locator().focus(timeout=timeout)

    await dropdown_menu_element.move_mouse_to(page=page)

    # total distance scrolled with the mouse wheel, needed to scroll back afterwards
    scroll_pace = 0
    last_count = await incremental_scraped.get_incremental_elements_num()
    deadline = datetime.now(timezone.utc) + timedelta(milliseconds=settings.OPTION_LOADING_TIMEOUT_MS)

    while True:
        if datetime.now(timezone.utc) >= deadline:
            LOG.warning("Timeout to load all options, maybe some options will be missed")
            break

        # make sure we can scroll to the bottom
        scroll_interval = settings.BROWSER_HEIGHT * 5
        if menu_handle is not None:
            await skyvern_frame.scroll_to_element_bottom(menu_handle)
            # scroll a little back and scroll down again to trigger the loading
            await page.mouse.wheel(0, -20)
            await page.mouse.wheel(0, 20)
        else:
            LOG.info("element handle is None, using mouse to scroll down", element_id=dropdown_menu_element.get_id())
            await page.mouse.wheel(0, scroll_interval)
            scroll_pace += scroll_interval

        # wait for a while to load new options
        await asyncio.sleep(5)

        current_count = await incremental_scraped.get_incremental_elements_num()
        LOG.info(
            "Current incremental elements count during the scrolling",
            num=current_count,
            step_id=step_id,
            task_id=task_id,
        )
        if current_count == last_count:
            break
        last_count = current_count

    # scroll back to the start point and wait for a while to make all options invisible on the page
    if menu_handle is not None:
        await skyvern_frame.scroll_to_element_top(menu_handle)
    else:
        LOG.info("element handle is None, using mouse to scroll back", element_id=dropdown_menu_element.get_id())
        await page.mouse.wheel(0, -scroll_pace)
    await asyncio.sleep(5)
async def normal_select(
action: actions.SelectOptionAction,
skyvern_element: SkyvernElement,

View file

@ -386,19 +386,8 @@ function isInteractable(element) {
return true;
}
if (
tagName === "div" ||
tagName === "img" ||
tagName === "span" ||
tagName === "a" ||
tagName === "i"
) {
const computedStyle = window.getComputedStyle(element);
const hasPointer = computedStyle.cursor === "pointer";
return hasPointer;
}
// support listbox and options underneath it
// div element should be checked here before the css pointer
if (
(tagName === "ul" || tagName === "div") &&
element.hasAttribute("role") &&
@ -414,9 +403,53 @@ function isInteractable(element) {
return true;
}
if (
tagName === "div" &&
element.hasAttribute("aria-disabled") &&
element.getAttribute("aria-disabled").toLowerCase() === "false"
) {
return true;
}
if (
tagName === "div" ||
tagName === "img" ||
tagName === "span" ||
tagName === "a" ||
tagName === "i"
) {
const computedStyle = window.getComputedStyle(element);
const hasPointer = computedStyle.cursor === "pointer";
return hasPointer;
}
return false;
}
// An element counts as scrollable when its content is larger than its client
// box in either direction AND its computed overflow allows scrolling.
function isScrollable(element) {
  const overflowsVertically =
    (element.scrollHeight || 0) > (element.clientHeight || 0);
  const overflowsHorizontally =
    (element.scrollWidth || 0) > (element.clientWidth || 0);
  const overflowAllowsScroll = isScrollableOverflow(element);

  return (overflowsVertically || overflowsHorizontally) && overflowAllowsScroll;
}
// True when any of the computed overflow / overflowX / overflowY values is
// exactly "auto" or "scroll".
function isScrollableOverflow(element) {
  const { overflow, overflowX, overflowY } = window.getComputedStyle(element);
  return [overflow, overflowX, overflowY].some(
    (value) => value === "auto" || value === "scroll",
  );
}
const isComboboxDropdown = (element) => {
if (element.tagName.toLowerCase() !== "input") {
return false;
@ -436,8 +469,8 @@ const isComboboxDropdown = (element) => {
const isSelect2Dropdown = (element) => {
return (
element.tagName.toLowerCase() === "span" &&
element.className.toString().includes("select2-chosen")
element.tagName.toLowerCase() === "a" &&
element.className.toString().includes("select2-choice")
);
};
@ -805,6 +838,14 @@ function uniqueId() {
}
// Builds the element tree for the whole document by delegating to
// buildElementTree with document.body as the starting node.
async function buildTreeFromBody(frame = "main.frame", open_select = false) {
  const starter = document.body;
  return buildElementTree(starter, frame, open_select);
}
async function buildElementTree(
starter = document.body,
frame = "main.frame",
open_select = false,
) {
var elements = [];
var resultArray = [];
@ -863,6 +904,13 @@ async function buildTreeFromBody(frame = "main.frame", open_select = false) {
// don't trim any attr of this element if keepAllAttr=True
keepAllAttr:
elementTagNameLower === "svg" || element.closest("svg") !== null,
isSelectable:
elementTagNameLower === "select" ||
isReactSelectDropdown(element) ||
isComboboxDropdown(element) ||
isSelect2Dropdown(element) ||
isSelect2MultiChoice(element),
isScrollable: isScrollable(element),
};
let isInShadowRoot = element.getRootNode() instanceof ShadowRoot;
@ -882,94 +930,8 @@ async function buildTreeFromBody(frame = "main.frame", open_select = false) {
let selectedValue = "";
if (elementTagNameLower === "select") {
[selectOptions, selectedValue] = getSelectOptions(element);
} else if (attrs["role"] && attrs["role"].toLowerCase() === "listbox") {
// if "role" key is inside attrs, then get all the elements with role "option" and get their text
selectOptions = getListboxOptions(element);
} else if (open_select && isReactSelectDropdown(element)) {
element.dispatchEvent(
new MouseEvent("mouseup", {
bubbles: true,
view: window,
}),
);
element.dispatchEvent(
new MouseEvent("mousedown", {
bubbles: true,
view: window,
}),
);
selectOptions = await getReactSelectOptions(element);
// click again to close
element.dispatchEvent(
new MouseEvent("mouseup", {
bubbles: true,
view: window,
}),
);
element.dispatchEvent(
new MouseEvent("mousedown", {
bubbles: true,
view: window,
}),
);
element.dispatchEvent(
new KeyboardEvent("keydown", {
keyCode: 27,
bubbles: true,
key: "Escape",
}),
);
} else if (open_select && isComboboxDropdown(element)) {
// open combobox dropdown to get options
element.click();
const listBox = element
.getRootNode()
.getElementById(element.getAttribute("aria-controls"));
if (listBox) {
selectOptions = getListboxOptions(listBox);
}
// HACK: press Tab to close the dropdown
element.dispatchEvent(
new KeyboardEvent("keydown", {
keyCode: 9,
bubbles: true,
key: "Tab",
}),
);
} else if (open_select && isSelect2Dropdown(element)) {
// click element to show options
element.dispatchEvent(
new MouseEvent("mousedown", {
bubbles: true,
view: window,
}),
);
selectOptions = await getSelect2Options(element);
// HACK: click again to close the dropdown
element.dispatchEvent(
new MouseEvent("mousedown", {
bubbles: true,
view: window,
}),
);
} else if (open_select && isSelect2MultiChoice(element)) {
// click element to show options
element.click();
selectOptions = await getSelect2Options(element);
// HACK: press ESC to close the dropdown
element.dispatchEvent(
new KeyboardEvent("keydown", {
keyCode: 27,
bubbles: true,
key: "Escape",
}),
);
}
if (selectOptions) {
elementObj.options = selectOptions;
}
@ -1308,9 +1270,8 @@ async function buildTreeFromBody(frame = "main.frame", open_select = false) {
return trimmedResults;
};
// TODO: Handle iframes
// setup before parsing the dom
await processElement(document.body, null);
await processElement(starter, null);
for (var element of elements) {
if (
@ -1568,6 +1529,22 @@ async function scrollToNextPage(draw_boxes) {
return window.scrollY;
}
// Instantly scrolls the element's own content to its bottom (top = scrollHeight).
function scrollToElementBottom(element) {
  const scrollOptions = {
    behavior: "instant",
    left: 0,
    top: element.scrollHeight,
  };
  element.scroll(scrollOptions);
}
// Instantly scrolls the element's own content back to its top-left corner.
function scrollToElementTop(element) {
  const scrollOptions = {
    behavior: "instant",
    left: 0,
    top: 0,
  };
  element.scroll(scrollOptions);
}
// Returns a promise that resolves after `ms` milliseconds.
async function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
@ -1589,3 +1566,140 @@ function findNodeById(arr, targetId, path = []) {
}
return null;
}
// Counts how many parentElement hops separate `elementNode` from the first
// element child of its root node. The walk also stops when a node has no
// parentElement.
function getElementDomDepth(elementNode) {
  const rootElement = elementNode.getRootNode().firstElementChild;
  let depth = 0;
  for (
    let current = elementNode;
    current !== rootElement && current.parentElement;
    current = current.parentElement
  ) {
    depth += 1;
  }
  return depth;
}
// Buffer of DOM increments recorded by the observer below. Each entry has the
// shape { targetNode, newNodes }.
if (window.globalOneTimeIncrementElements === undefined) {
  window.globalOneTimeIncrementElements = [];
}

// The observer is created once per window and records two kinds of increments:
//   - a "style" attribute change that leaves the target with a computed
//     display other than "none" (the target itself is recorded as the new node)
//   - non-text nodes added to the DOM (childList mutations)
if (window.globalObserverForDOMIncrement === undefined) {
  window.globalObserverForDOMIncrement = new MutationObserver(function (
    mutationsList,
  ) {
    for (const mutation of mutationsList) {
      if (mutation.type === "attributes") {
        if (mutation.attributeName === "style") {
          // TODO: need to confirm that element is hidden previously
          // `node` is declared locally; an undeclared assignment here would
          // leak a global and throw in strict mode.
          const node = mutation.target;
          if (node.nodeType === Node.TEXT_NODE) continue;
          const newStyle = window.getComputedStyle(node);
          const newDisplay = newStyle.display;
          if (newDisplay !== "none") {
            window.globalOneTimeIncrementElements.push({
              targetNode: node,
              newNodes: [node],
            });
          }
        }
        // TODO: we maybe need to detect the visibility change from class
        // if (mutation.attributeName === "class") {
        // }
      }

      if (mutation.type === "childList") {
        let changedNode = {
          targetNode: mutation.target, // TODO: for future usage, when we want to parse new elements into a tree
        };
        let newNodes = [];
        if (mutation.addedNodes && mutation.addedNodes.length > 0) {
          for (const node of mutation.addedNodes) {
            // skip the text nodes, they won't be interactable
            if (node.nodeType === Node.TEXT_NODE) continue;
            newNodes.push(node);
          }
        }
        if (newNodes.length > 0) {
          changedNode.newNodes = newNodes;
          window.globalOneTimeIncrementElements.push(changedNode);
        }
      }
    }
  });
}
// Resets the increment buffer, discards any pending mutation records, and
// starts observing document.body (attributes, child list, character data,
// whole subtree).
function startGlobalIncrementalObserver() {
  window.globalOneTimeIncrementElements = [];

  const observer = window.globalObserverForDOMIncrement;
  observer.takeRecords(); // cleanup the older data

  const observeOptions = {
    attributes: true,
    attributeOldValue: true,
    childList: true,
    subtree: true,
    characterData: true,
  };
  observer.observe(document.body, observeOptions);
}
// Stops observing, discards any pending mutation records, and empties the
// increment buffer.
function stopGlobalIncrementalObserver() {
  const observer = window.globalObserverForDOMIncrement;
  observer.disconnect();
  observer.takeRecords(); // cleanup the older data
  window.globalOneTimeIncrementElements = [];
}
// Converts the recorded DOM increments into element trees.
//
// Returns [flatElements, treeHeads]:
//   - flatElements: every unique element (by id) found in the new trees
//   - treeHeads: the de-duplicated list of tree head elements
async function getIncrementElements(frame) {
  // depth of the mutation target -> trees built from the nodes added under it
  const domDepthMap = new Map();

  for (const element of window.globalOneTimeIncrementElements) {
    // calculate the depth of targetNode element for sorting
    const depth = getElementDomDepth(element.targetNode);
    const newNodesTreeList = domDepthMap.has(depth)
      ? domDepthMap.get(depth)
      : [];

    for (const child of element.newNodes) {
      const [, newNodeTree] = await buildElementTree(child, frame, false);
      if (newNodeTree.length > 0) {
        newNodesTreeList.push(...newNodeTree);
      }
    }
    domDepthMap.set(depth, newNodesTreeList);
  }

  // cleanup the children tree, remove the duplicated element
  // search starting from the shallowest node:
  // 1. if deeper, the node could only be the children of the shallower one or no related one.
  // 2. if depth is same, the node could only be duplicated one or no related one.
  const idToElement = new Map();
  const cleanedTreeList = [];

  // Depths are numbers: a numeric comparator is required, because the default
  // sort compares string representations (so 10 would come before 2) and the
  // shallowest-first traversal above would be violated.
  const sortedDepth = Array.from(domDepthMap.keys()).sort((a, b) => a - b);

  for (const depth of sortedDepth) {
    const treeList = domDepthMap.get(depth);

    for (const treeHeadElement of treeList) {
      // skip the tree head if it has already been collected
      if (idToElement.has(treeHeadElement.id)) {
        continue;
      }

      cleanedTreeList.push(treeHeadElement);

      // flatten the tree breadth-first, registering each unseen element
      const pendingElements = [treeHeadElement];
      for (let curIndex = 0; curIndex < pendingElements.length; curIndex++) {
        const curElement = pendingElements[curIndex];
        if (idToElement.has(curElement.id)) {
          continue;
        }
        idToElement.set(curElement.id, curElement);
        pendingElements.push(...curElement.children);
      }
    }
  }

  return [Array.from(idToElement.values()), cleanedTreeList];
}

View file

@ -96,6 +96,9 @@ def json_to_html(element: dict) -> str:
attributes_html = " ".join(build_attribute(key, value) for key, value in attributes.items())
tag = element["tagName"]
if element.get("isSelectable", False):
tag = "select"
text = element.get("text", "")
# build children HTML
children_html = "".join(json_to_html(child) for child in element.get("children", []))
@ -112,6 +115,21 @@ def json_to_html(element: dict) -> str:
return f'<{tag}{attributes_html if not attributes_html else " "+attributes_html}>{text}{children_html+option_html}</{tag}>'
def build_element_dict(elements: list[dict]) -> tuple[dict[str, str], dict[str, dict], dict[str, str]]:
    """Index scraped elements by their id.

    Args:
        elements: element dicts; each must have a "frame" key. A missing "id"
            is treated as the empty string.

    Returns:
        A tuple ``(id_to_css, id_to_element, id_to_frame)`` mapping each
        element id to its CSS selector, the element dict itself, and the
        element's frame. When ids repeat, the last element wins.
    """
    element_ids: list[str] = [element.get("id", "") for element in elements]

    # get_interactable_element_tree marks each interactable element with a unique_id attribute
    css_by_id: dict[str, str] = {
        element_id: f"[{SKYVERN_ID_ATTR}='{element_id}']" for element_id in element_ids
    }
    element_by_id: dict[str, dict] = dict(zip(element_ids, elements))
    frame_by_id: dict[str, str] = {
        element_id: element["frame"] for element_id, element in zip(element_ids, elements)
    }
    return css_by_id, element_by_id, frame_by_id
class ElementTreeFormat(StrEnum):
JSON = "json"
HTML = "html"
@ -266,16 +284,7 @@ async def scrape_web_unsafe(
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
element_tree = await cleanup_element_tree(url, copy.deepcopy(element_tree))
id_to_css_dict = {}
id_to_element_dict = {}
id_to_frame_dict = {}
for element in elements:
element_id = element["id"]
# get_interactable_element_tree marks each interactable element with a unique_id attribute
id_to_css_dict[element_id] = f"[{SKYVERN_ID_ATTR}='{element_id}']"
id_to_element_dict[element_id] = element
id_to_frame_dict[element_id] = element["frame"]
id_to_css_dict, id_to_element_dict, id_to_frame_dict = build_element_dict(elements)
text_content = await get_frame_text(page.main_frame)
@ -378,6 +387,65 @@ async def get_interactable_element_tree(
return elements, element_tree
class IncrementalScrapePage:
    """Scrapes the elements that appeared in a frame since DOM observation started.

    The page-side work is done by the injected JS helpers
    (``startGlobalIncrementalObserver``, ``stopGlobalIncrementalObserver``,
    ``getIncrementElements``); this class drives them and keeps the result of
    the most recent scrape.
    """

    # Declared for type checkers only. The values are created per instance in
    # __init__ so that no mutable state is shared between instances.
    id_to_element_dict: dict[str, dict]
    id_to_css_dict: dict[str, str]
    elements: list[dict]
    element_tree: list[dict]
    element_tree_trimmed: list[dict]

    def __init__(self, skyvern_frame: SkyvernFrame) -> None:
        self.skyvern_frame = skyvern_frame
        # Start from empty results so lookups made before the first scrape
        # behave like "nothing found" instead of raising AttributeError.
        self.id_to_element_dict = {}
        self.id_to_css_dict = {}
        self.elements = []
        self.element_tree = []
        self.element_tree_trimmed = []

    async def get_incremental_element_tree(
        self,
        cleanup_element_tree: Callable[[str, list[dict]], Awaitable[list[dict]]],
    ) -> list[dict]:
        """Collect the incremental elements and rebuild the stored trees.

        Args:
            cleanup_element_tree: async callable given the frame URL and a copy
                of the raw incremental tree; its result is stored as
                ``element_tree``.

        Returns:
            The trimmed element tree (also stored as ``element_tree_trimmed``).
        """
        frame = self.skyvern_frame.get_frame()

        frame_id = "main.frame"
        if isinstance(frame, Frame):
            try:
                frame_element = await frame.frame_element()
                unique_id = await frame_element.get_attribute("unique_id")
                # A missing attribute yields None; keep the default id rather
                # than sending the literal string 'None' to the page.
                if unique_id:
                    frame_id = unique_id
            except Exception:
                # TODO: do we really care about the frame_id ?
                LOG.warning(
                    "Unable to get frame_element",
                    exc_info=True,
                )

        # Pass the frame id as an argument instead of interpolating it into the
        # script source, so quotes in the id cannot break or alter the script.
        js_script = "async (frameId) => await getIncrementElements(frameId)"
        incremental_elements, incremental_tree = await frame.evaluate(js_script, frame_id)
        # we listen the incremental elements separated by frames, so all elements will be in the same SkyvernFrame
        self.id_to_css_dict, self.id_to_element_dict, _ = build_element_dict(incremental_elements)

        self.elements = incremental_elements

        incremental_tree = await cleanup_element_tree(frame.url, copy.deepcopy(incremental_tree))
        trimmed_element_tree = trim_element_tree(copy.deepcopy(incremental_tree))

        self.element_tree = incremental_tree
        self.element_tree_trimmed = trimmed_element_tree

        return self.element_tree_trimmed

    async def start_listen_dom_increment(self) -> None:
        """Start the page-side observer that records DOM increments."""
        js_script = "() => startGlobalIncrementalObserver()"
        await self.skyvern_frame.get_frame().evaluate(js_script)

    async def stop_listen_dom_increment(self) -> None:
        """Stop the page-side observer."""
        js_script = "() => stopGlobalIncrementalObserver()"
        await self.skyvern_frame.get_frame().evaluate(js_script)

    async def get_incremental_elements_num(self) -> int:
        """Return the number of increment records currently buffered in the page."""
        js_script = "() => window.globalOneTimeIncrementElements.length"
        return await self.skyvern_frame.get_frame().evaluate(js_script)

    def build_html_tree(self) -> str:
        """Render the trimmed element tree of the latest scrape as an HTML string."""
        return "".join([json_to_html(element) for element in self.element_tree_trimmed])
def trim_element_tree(elements: list[dict]) -> list[dict]:
queue = []
for element in elements:

View file

@ -1,8 +1,10 @@
from __future__ import annotations
import asyncio
import typing
from abc import ABC, abstractmethod
from enum import StrEnum
from random import uniform
import structlog
from playwright.async_api import Frame, FrameLocator, Locator, Page
@ -21,11 +23,12 @@ from skyvern.exceptions import (
MultipleDropdownAnchorErr,
MultipleElementsFound,
NoDropdownAnchorErr,
NoElementBoudingBox,
NoneFrameError,
SkyvernException,
)
from skyvern.forge.sdk.settings_manager import SettingsManager
from skyvern.webeye.scraper.scraper import ScrapedPage
from skyvern.webeye.scraper.scraper import IncrementalScrapePage, ScrapedPage
from skyvern.webeye.utils.page import SkyvernFrame
LOG = structlog.get_logger()
@ -94,6 +97,35 @@ class SkyvernElement:
When you try to interact with these elements by python, you are supposed to use this class as an interface.
"""
@classmethod
async def create_from_incremental(cls, incre_page: IncrementalScrapePage, element_id: str) -> SkyvernElement:
    """Build a SkyvernElement for ``element_id`` out of an incremental scrape.

    The element's static dict and its CSS selector are looked up on
    ``incre_page``; the selector must then resolve to exactly one node in
    the frame returned by ``incre_page.skyvern_frame.get_frame()``.

    Raises:
        MissingElementDict: no static dict is recorded for ``element_id``.
        MissingElementInCSSMap: the CSS selector is missing or empty.
        MissingElement: the selector matches no node in the frame.
        MultipleElementsFound: the selector matches more than one node.
    """
    static_element = incre_page.id_to_element_dict.get(element_id)
    if static_element is None:
        raise MissingElementDict(element_id)

    selector = incre_page.id_to_css_dict.get(element_id)
    if not selector:
        raise MissingElementInCSSMap(element_id)

    target_frame = incre_page.skyvern_frame.get_frame()
    target_locator = target_frame.locator(selector)
    match_count = await target_locator.count()

    # Happy path first: exactly one match is the only valid outcome.
    if match_count == 1:
        return cls(target_locator, target_frame, static_element)

    if match_count < 1:
        LOG.warning("No elements found with css. Validation failed.", css=selector, element_id=element_id)
        raise MissingElement(selector=selector, element_id=element_id)

    LOG.warning(
        "Multiple elements found with css. Expected 1. Validation failed.",
        num_elements=match_count,
        selector=selector,
        element_id=element_id,
    )
    raise MultipleElementsFound(num=match_count, selector=selector, element_id=element_id)
def __init__(self, locator: Locator, frame: Page | Frame, static_element: dict) -> None:
self.__static_element = static_element
self.__frame = frame
@ -147,12 +179,13 @@ class SkyvernElement:
return self.__static_element.get("interactable", False)
async def is_selectable(self) -> bool:
    """Report whether this element should be handled as a selection control.

    Returns True when the scraped static data flags the element as
    selectable (``get_selectable()``) or when its tag name is a member of
    ``SELECTABLE_ELEMENT``. The method remains a coroutine so it is still
    awaited the same way as before.
    """
    # A previous `return (...)` that probed select2 / react-select / combobox
    # dropdowns sat in front of this statement and made it unreachable; only the
    # static `isSelectable` flag plus the tag-name check is kept.
    return self.get_selectable() or self.get_tag_name() in SELECTABLE_ELEMENT
def get_scrollable(self) -> bool:
    """Return the ``isScrollable`` entry of the static element dict, or False when it is absent."""
    static_info = self.__static_element
    return static_info.get("isScrollable", False)
def get_selectable(self) -> bool:
    """Return the ``isSelectable`` entry of the static element dict, or False when it is absent."""
    static_info = self.__static_element
    return static_info.get("isSelectable", False)
def get_tag_name(self) -> str:
    """Return the ``tagName`` entry of the static element dict, or an empty string when it is absent."""
    static_info = self.__static_element
    return static_info.get("tagName", "")
@ -294,6 +327,36 @@ class SkyvernElement:
async def input_clear(self, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS) -> None:
    """Clear this element through its locator, passing ``timeout`` to the locator's ``clear`` call."""
    locator = self.get_locator()
    await locator.clear(timeout=timeout)
async def move_mouse_to(
    self, page: Page, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
) -> tuple[float, float]:
    """Move the mouse to a point inside this element's bounding box.

    Args:
        page: page whose ``mouse`` is moved.
        timeout: passed to the locator's ``bounding_box`` call.

    Returns:
        The ``(x, y)`` coordinates the mouse was moved to.

    Raises:
        NoElementBoudingBox: the locator reports no bounding box.
    """
    bounding_box = await self.get_locator().bounding_box(timeout=timeout)
    if not bounding_box:
        raise NoElementBoudingBox(element_id=self.get_id())

    x, y = bounding_box["x"], bounding_box["y"]
    width, height = bounding_box["width"], bounding_box["height"]

    # Pick a random point strictly inside the box (epsilon keeps it off the border).
    # When a side is too small to leave room for the margin, fall back to the
    # midpoint of that side: origin + size / 2. The previous `(origin + size) / 2`
    # halved the origin as well and could land far outside the element.
    epsilon = 0.01
    dest_x = uniform(x + epsilon, x + width - epsilon) if width > 2 * epsilon else x + width / 2
    dest_y = uniform(y + epsilon, y + height - epsilon) if height > 2 * epsilon else y + height / 2

    await page.mouse.move(dest_x, dest_y)
    return dest_x, dest_y
async def coordinate_click(
    self, page: Page, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
) -> None:
    """Move the mouse onto this element via ``move_mouse_to`` and click at the returned coordinates."""
    target_point = await self.move_mouse_to(page=page, timeout=timeout)
    target_x, target_y = target_point
    await page.mouse.click(target_x, target_y)
async def scroll_into_view(self, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS) -> None:
    """Scroll this element into view if needed, then pause for two seconds.

    If the locator yields no element handle, a warning is logged and nothing
    else happens.
    """
    handle = await self.get_locator().element_handle()
    if handle is not None:
        await handle.scroll_into_view_if_needed(timeout=timeout)
        await asyncio.sleep(2)  # wait for scrolling into the target
        return

    LOG.warning("element handler is None. ", element_id=self.get_id())
class DomUtil:
"""

View file

@ -145,6 +145,14 @@ class SkyvernFrame:
async with asyncio.timeout(timeout):
return await self.frame.content()
async def scroll_to_element_bottom(self, element: ElementHandle) -> None:
    """Evaluate the page-side ``scrollToElementBottom(element)`` in this frame and return its result."""
    outcome = await self.frame.evaluate("(element) => scrollToElementBottom(element)", element)
    return outcome
async def scroll_to_element_top(self, element: ElementHandle) -> None:
    """Evaluate the page-side ``scrollToElementTop(element)`` in this frame and return its result."""
    outcome = await self.frame.evaluate("(element) => scrollToElementTop(element)", element)
    return outcome
async def get_select2_options(self, element: ElementHandle) -> List[Dict[str, Any]]:
await self.frame.evaluate(JS_FUNCTION_DEFS)
js_script = "async (element) => await getSelect2Options(element)"