remove the old context tree (#268)

This commit is contained in:
LawyZheng 2024-05-08 10:16:30 +08:00 committed by GitHub
parent bce6326eef
commit 8d87e71891
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 10 additions and 20 deletions

View file

@ -136,13 +136,11 @@ class BrowserState:
browser_context: BrowserContext | None = None, browser_context: BrowserContext | None = None,
page: Page | None = None, page: Page | None = None,
browser_artifacts: BrowserArtifacts = BrowserArtifacts(), browser_artifacts: BrowserArtifacts = BrowserArtifacts(),
new_context_tree: bool = False,
): ):
self.pw = pw self.pw = pw
self.browser_context = browser_context self.browser_context = browser_context
self.page = page self.page = page
self.browser_artifacts = browser_artifacts self.browser_artifacts = browser_artifacts
self.new_context_tree = new_context_tree
async def _close_all_other_pages(self) -> None: async def _close_all_other_pages(self) -> None:
if not self.browser_context or not self.page: if not self.browser_context or not self.page:

View file

@ -25,7 +25,6 @@ class BrowserManager:
async def _create_browser_state( async def _create_browser_state(
proxy_location: ProxyLocation | None = None, proxy_location: ProxyLocation | None = None,
url: str | None = None, url: str | None = None,
new_context_tree: bool = False,
task_id: str | None = None, task_id: str | None = None,
) -> BrowserState: ) -> BrowserState:
pw = await async_playwright().start() pw = await async_playwright().start()
@ -40,7 +39,6 @@ class BrowserManager:
browser_context=browser_context, browser_context=browser_context,
page=None, page=None,
browser_artifacts=browser_artifacts, browser_artifacts=browser_artifacts,
new_context_tree=new_context_tree,
) )
async def get_or_create_for_task(self, task: Task) -> BrowserState: async def get_or_create_for_task(self, task: Task) -> BrowserState:
@ -55,9 +53,8 @@ class BrowserManager:
self.pages[task.task_id] = self.pages[task.workflow_run_id] self.pages[task.task_id] = self.pages[task.workflow_run_id]
return self.pages[task.task_id] return self.pages[task.task_id]
new_ctx = True LOG.info("Creating browser state for task", task_id=task.task_id)
LOG.info("Creating browser state for task", task_id=task.task_id, new_ctx=new_ctx) browser_state = await self._create_browser_state(task.proxy_location, task.url, task.task_id)
browser_state = await self._create_browser_state(task.proxy_location, task.url, new_ctx, task.task_id)
# The URL here is only used when creating a new page, and not when using an existing page. # The URL here is only used when creating a new page, and not when using an existing page.
# This will make sure browser_state.page is not None. # This will make sure browser_state.page is not None.

View file

@ -544,7 +544,7 @@ function getListboxOptions(element) {
return selectOptions; return selectOptions;
} }
function buildTreeFromBody(new_ctx = false) { function buildTreeFromBody() {
var elements = []; var elements = [];
var resultArray = []; var resultArray = [];
@ -634,7 +634,6 @@ function buildTreeFromBody(new_ctx = false) {
} }
if ( if (
new_ctx &&
checkRequiredFromStyle(element) && checkRequiredFromStyle(element) &&
!attrs["required"] && !attrs["required"] &&
!attrs["aria-required"] !attrs["aria-required"]
@ -712,7 +711,7 @@ function buildTreeFromBody(new_ctx = false) {
elements[interactableParentId].children.push(elementObj); elements[interactableParentId].children.push(elementObj);
} }
// options already added to the select.options, no need to add options anymore // options already added to the select.options, no need to add options anymore
if (new_ctx && elementObj.options && elementObj.options.length > 0) { if (elementObj.options && elementObj.options.length > 0) {
return elementObj; return elementObj;
} }
// Recursively process the children of the element // Recursively process the children of the element
@ -744,7 +743,7 @@ function buildTreeFromBody(new_ctx = false) {
if (parentEle) { if (parentEle) {
if ( if (
targetParentElements.has(parentEle.tagName.toLowerCase()) || targetParentElements.has(parentEle.tagName.toLowerCase()) ||
(new_ctx && checkParentClass(parentEle.className.toLowerCase())) checkParentClass(parentEle.className.toLowerCase())
) { ) {
targetContextualParent = parentEle; targetContextualParent = parentEle;
} }
@ -939,7 +938,7 @@ function buildTreeFromBody(new_ctx = false) {
element.context = context; element.context = context;
} }
if (new_ctx && checkStringIncludeRequire(context)) { if (checkStringIncludeRequire(context)) {
if ( if (
!element.attributes["required"] && !element.attributes["required"] &&
!element.attributes["aria-required"] !element.attributes["aria-required"]
@ -949,10 +948,6 @@ function buildTreeFromBody(new_ctx = false) {
} }
} }
if (!new_ctx) {
return [elements, resultArray];
}
resultArray = removeOrphanNode(resultArray); resultArray = removeOrphanNode(resultArray);
resultArray.forEach((root) => { resultArray.forEach((root) => {
trimDuplicatedText(root); trimDuplicatedText(root);

View file

@ -184,7 +184,7 @@ async def scrape_web_unsafe(
await remove_bounding_boxes(page) await remove_bounding_boxes(page)
await scroll_to_top(page, drow_boxes=False) await scroll_to_top(page, drow_boxes=False)
elements, element_tree = await get_interactable_element_tree(page, browser_state.new_context_tree) elements, element_tree = await get_interactable_element_tree(page)
element_tree = cleanup_elements(copy.deepcopy(element_tree)) element_tree = cleanup_elements(copy.deepcopy(element_tree))
_build_element_links(elements) _build_element_links(elements)
@ -211,15 +211,15 @@ async def scrape_web_unsafe(
) )
async def get_interactable_element_tree(page: Page, new_context_tree: bool) -> tuple[list[dict], list[dict]]: async def get_interactable_element_tree(page: Page) -> tuple[list[dict], list[dict]]:
""" """
Get the element tree of the page, including all the elements that are interactable. Get the element tree of the page, including all the elements that are interactable.
:param page: Page instance to get the element tree from. :param page: Page instance to get the element tree from.
:return: Tuple containing the element tree and a map of element IDs to elements. :return: Tuple containing the element tree and a map of element IDs to elements.
""" """
await page.evaluate(JS_FUNCTION_DEFS) await page.evaluate(JS_FUNCTION_DEFS)
js_script = "(new_ctx) => buildTreeFromBody(new_ctx)" js_script = "() => buildTreeFromBody()"
elements, element_tree = await page.evaluate(js_script, new_context_tree) elements, element_tree = await page.evaluate(js_script)
return elements, element_tree return elements, element_tree