from __future__ import annotations

import asyncio
import random
import time
from typing import Literal
from urllib.parse import urlparse

import structlog
from playwright.async_api import BrowserContext, Page, Playwright

from skyvern.config import settings
from skyvern.constants import BROWSER_CLOSE_TIMEOUT, BROWSER_PAGE_CLOSE_TIMEOUT, NAVIGATION_MAX_RETRY_TIME
from skyvern.exceptions import (
    EmptyBrowserContext,
    FailedToNavigateToUrl,
    FailedToReloadPage,
    FailedToStopLoadingPage,
    MissingBrowserStatePage,
)
from skyvern.forge.sdk.trace import traced
from skyvern.schemas.runs import ProxyLocationInput
from skyvern.webeye.browser_artifacts import BrowserArtifacts, VideoArtifact
from skyvern.webeye.browser_factory import BrowserCleanupFunc, BrowserContextFactory
from skyvern.webeye.browser_state import BrowserState
from skyvern.webeye.navigation import is_permanent_navigation_error, navigate_with_retry
from skyvern.webeye.scraper import scraper
from skyvern.webeye.scraper.scraped_page import CleanupElementTreeFunc, ScrapedPage, ScrapeExcludeFunc
from skyvern.webeye.utils.page import ScreenshotMode, SkyvernFrame

LOG = structlog.get_logger()
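
# Settle delay applied after navigations and reloads: a fixed base wait plus a
# random jitter of up to SETTLE_JITTER_MS milliseconds (see _wait_for_settle).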
SETTLE_TIME_MS = 750
SETTLE_JITTER_MS = 500


class RealBrowserState(BrowserState):
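    # Playwright-backed BrowserState implementation: owns the Playwright handle,
    # an optional BrowserContext, and a single "working page". The working page
    # is assumed to be the last valid (blank or http/https) page in the context;
    # see get_working_page.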
    def __init__(
        self,
        pw: Playwright,
        browser_context: BrowserContext | None = None,
        page: Page | None = None,
        browser_artifacts: BrowserArtifacts | None = None,
        browser_cleanup: BrowserCleanupFunc = None,
    ):
        self.__page = page
        self.pw = pw
        self.browser_context = browser_context
        # Build a fresh BrowserArtifacts per instance instead of relying on a
        # mutable default argument, which would be shared across instances.
        self.browser_artifacts = browser_artifacts or BrowserArtifacts()
        self.browser_cleanup = browser_cleanup

    async def __assert_page(self) -> Page:
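        # Return the working page; raise MissingBrowserStatePage if there is none.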
        page = await self.get_working_page()
        if page is not None:
            return page
        pages = (self.browser_context.pages or []) if self.browser_context else []
        LOG.error("BrowserState has no page", urls=[p.url for p in pages])
        raise MissingBrowserStatePage()

    async def _close_all_other_pages(self) -> None:
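        # Close every page in the context except the current working page. Each
        # close is capped at 2 seconds so one hung page cannot stall cleanup.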
        cur_page = await self.get_working_page()
        if not self.browser_context or not cur_page:
            return
        pages = self.browser_context.pages
        for page in pages:
            if page != cur_page:
                try:
                    async with asyncio.timeout(2):
                        await page.close()
                except asyncio.TimeoutError:
                    LOG.warning("Timed out while closing the page; skipping it", url=page.url)
                except Exception:
                    LOG.exception("Error while closing the page", url=page.url)

    async def check_and_fix_state(
        self,
        url: str | None = None,
        proxy_location: ProxyLocationInput = None,
        task_id: str | None = None,
        workflow_run_id: str | None = None,
        workflow_permanent_id: str | None = None,
        script_id: str | None = None,
        organization_id: str | None = None,
        extra_http_headers: dict[str, str] | None = None,
        browser_address: str | None = None,
        browser_profile_id: str | None = None,
    ) -> None:
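        # Ensure the browser state is usable: create the browser context if it is
        # missing, pick or open a working page, and navigate to the given url if
        # the working page is not already on it.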
        if self.browser_context is None:
            LOG.info("creating browser context")
            (
                browser_context,
                browser_artifacts,
                browser_cleanup,
            ) = await BrowserContextFactory.create_browser_context(
                self.pw,
                url=url,
                proxy_location=proxy_location,
                task_id=task_id,
                workflow_run_id=workflow_run_id,
                workflow_permanent_id=workflow_permanent_id,
                script_id=script_id,
                organization_id=organization_id,
                extra_http_headers=extra_http_headers,
                browser_address=browser_address,
                browser_profile_id=browser_profile_id,
            )
            self.browser_context = browser_context
            self.browser_artifacts = browser_artifacts
            self.browser_cleanup = browser_cleanup
            LOG.info("browser context is created")

        if await self.get_working_page() is None:
            page: Page | None = None
            use_existing_page = False
            # When attached to a remote browser over CDP, reuse the most recent
            # valid page instead of opening a new one.
            if browser_address and len(self.browser_context.pages) > 0:
                pages = await self.list_valid_pages()
                if len(pages) > 0:
                    page = pages[-1]
                    use_existing_page = True
            if page is None:
                page = await self.browser_context.new_page()

            await self.set_working_page(page, 0)
            if not use_existing_page:
                await self._close_all_other_pages()

            if url and page.url.rstrip("/") != url.rstrip("/"):
                await self.navigate_to_url(page=page, url=url)

    async def _wait_for_settle(self) -> None:
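        # Sleep for SETTLE_TIME_MS plus a random jitter of up to SETTLE_JITTER_MS,
        # giving the page time to finish late work after a navigation or reload.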
        total_wait_ms = SETTLE_TIME_MS
        if SETTLE_JITTER_MS > 0:
            total_wait_ms += random.randint(0, SETTLE_JITTER_MS)
        await asyncio.sleep(total_wait_ms / 1000)

    async def navigate_to_url(
        self,
        page: Page,
        url: str,
        retry_times: int = NAVIGATION_MAX_RETRY_TIME,
        wait_until: Literal["load", "domcontentloaded", "commit"] = "load",
    ) -> None:
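        # Delegate the retry loop to navigate_with_retry, which drives page.goto
        # up to retry_times with the requested wait_until strategy and is given
        # the settle callback to pause after navigation.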
        await navigate_with_retry(
            navigate=lambda strategy: page.goto(url, timeout=settings.BROWSER_LOADING_TIMEOUT_MS, wait_until=strategy),
            url=url,
            retry_times=retry_times,
            settle=self._wait_for_settle,
            wait_until=wait_until,
        )

    async def get_working_page(self) -> Page | None:
        # HACK: assumes the last page is always the working page.
        # This logic needs a refactor before multiple pages can be driven together.
        # TODO: stop relying on the page index; a more robust scheme is needed to
        # fully support multi-page manipulation.
        if self.__page is None or self.browser_context is None:
            return None

        # Pick the last blank or http/https page as the working page.
        pages = await self.list_valid_pages()
        if len(pages) == 0:
            LOG.info("No http(s) or blank page found in the browser context; returning None")
            return None

        last_page = pages[-1]
        if self.__page == last_page:
            return self.__page
        await self.set_working_page(last_page, len(pages) - 1)
        return last_page

    async def list_valid_pages(self, max_pages: int = settings.BROWSER_MAX_PAGES_NUMBER) -> list[Page]:
        # List all valid pages (blank pages and http/https pages) in the browser
        # context, up to max_pages.
        # MSEdge CDP quirk(?): when connected to MSEdge over CDP, the download hub
        # shows up in context.pages, so filter pages by URL scheme.
        if self.browser_context is None:
            return []

        pages = [
            http_page
            for http_page in self.browser_context.pages
            if (
                http_page.url == "about:blank"
                or http_page.url == ":"  # sometimes the page url is ":", which is a blank page
                or http_page.url == "chrome-error://chromewebdata/"
                or urlparse(http_page.url).scheme in ["http", "https"]
            )
        ]

        if max_pages <= 0 or len(pages) <= max_pages:
            return pages
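
        # Over the limit: keep the newest max_pages pages and close the oldest
        # ones, each with a bounded timeout. Closing a page may drop its video
        # recording.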
        reserved_pages = pages[-max_pages:]

        closing_pages = pages[: len(pages) - max_pages]
        LOG.warning(
            "Page count exceeds the limit; closing the oldest pages. Their video recordings may be lost",
            closing_page_urls=[page.url for page in closing_pages],
        )
        for page in closing_pages:
            try:
                async with asyncio.timeout(BROWSER_PAGE_CLOSE_TIMEOUT):
                    await page.close()
            except Exception:
                LOG.warning("Error while closing the page", exc_info=True)

        return reserved_pages

    async def validate_browser_context(self, page: Page) -> bool:
        # Validate the page content: fetch the HTML and scan it for markers that
        # indicate a broken proxy node, in which case the browser context should
        # be recreated with another proxy.
        try:
            skyvern_frame = await SkyvernFrame.create_instance(frame=page)
            html = await skyvern_frame.get_content()
        except Exception:
            LOG.error(
                "Error happened while getting the first page content",
                exc_info=True,
            )
            return False

        if "Bad gateway error" in html:
            LOG.warning("Bad gateway error on the page; recreating a new browser context with another proxy node")
            return False

        if "client_connect_forbidden_host" in html:
            LOG.warning(
                "Captured the client_connect_forbidden_host error on the page; recreating a new browser context with another proxy node"
            )
            return False

        return True

    async def must_get_working_page(self) -> Page:
        page = await self.get_working_page()
        if page is None:
            raise MissingBrowserStatePage()
        return page

    async def set_working_page(self, page: Page | None, index: int = 0) -> None:
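        # Record the working page and capture its video artifact path at the given
        # index, growing the video artifact list if needed. Fetching the video path
        # is bounded by BROWSER_ACTION_TIMEOUT_MS so it cannot block the caller.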
        self.__page = page
        if page is None:
            return
        if len(self.browser_artifacts.video_artifacts) > index:
            if self.browser_artifacts.video_artifacts[index].video_path is None:
                try:
                    async with asyncio.timeout(settings.BROWSER_ACTION_TIMEOUT_MS / 1000):
                        if page.video:
                            self.browser_artifacts.video_artifacts[index].video_path = await page.video.path()
                except asyncio.TimeoutError:
                    LOG.info("Timed out while getting the page video; skipping it")
                except Exception:
                    LOG.exception("Error while getting the page video")
            return

        target_length = index + 1
        self.browser_artifacts.video_artifacts.extend(
            [VideoArtifact()] * (target_length - len(self.browser_artifacts.video_artifacts))
        )
        try:
            async with asyncio.timeout(settings.BROWSER_ACTION_TIMEOUT_MS / 1000):
                if page.video:
                    self.browser_artifacts.video_artifacts[index].video_path = await page.video.path()
        except asyncio.TimeoutError:
            LOG.info("Timed out while getting the page video; skipping it")
        except Exception:
            LOG.exception("Error while getting the page video")

    async def get_or_create_page(
        self,
        url: str | None = None,
        proxy_location: ProxyLocationInput = None,
        task_id: str | None = None,
        workflow_run_id: str | None = None,
        workflow_permanent_id: str | None = None,
        script_id: str | None = None,
        organization_id: str | None = None,
        extra_http_headers: dict[str, str] | None = None,
        browser_address: str | None = None,
        browser_profile_id: str | None = None,
    ) -> Page:
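        # Return the working page, creating the browser context and page on
        # demand. A transient network error (net::ERR*) triggers one teardown and
        # retry, and a failed proxy-health validation triggers one more; permanent
        # navigation errors are re-raised immediately.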
        page = await self.get_working_page()
        if page is not None:
            return page

        try:
            await self.check_and_fix_state(
                url=url,
                proxy_location=proxy_location,
                task_id=task_id,
                workflow_run_id=workflow_run_id,
                workflow_permanent_id=workflow_permanent_id,
                script_id=script_id,
                organization_id=organization_id,
                extra_http_headers=extra_http_headers,
                browser_address=browser_address,
                browser_profile_id=browser_profile_id,
            )
        except Exception as e:
            error_message = e.error_message if isinstance(e, FailedToNavigateToUrl) else str(e)
            if is_permanent_navigation_error(error_message):
                raise
            if "net::ERR" not in error_message:
                raise
            if not await self.close_current_open_page():
                LOG.warning("Failed to close the current open page")
                raise
            await self.check_and_fix_state(
                url=url,
                proxy_location=proxy_location,
                task_id=task_id,
                workflow_run_id=workflow_run_id,
                workflow_permanent_id=workflow_permanent_id,
                script_id=script_id,
                organization_id=organization_id,
                extra_http_headers=extra_http_headers,
                browser_address=browser_address,
                browser_profile_id=browser_profile_id,
            )
        page = await self.__assert_page()

        if not await self.validate_browser_context(page):
            if not await self.close_current_open_page():
                LOG.warning("Failed to close the current open page; skipping the browser context validation")
                return page
            await self.check_and_fix_state(
                url=url,
                proxy_location=proxy_location,
                task_id=task_id,
                workflow_run_id=workflow_run_id,
                workflow_permanent_id=workflow_permanent_id,
                script_id=script_id,
                organization_id=organization_id,
                extra_http_headers=extra_http_headers,
                browser_address=browser_address,
                browser_profile_id=browser_profile_id,
            )
            page = await self.__assert_page()
        return page

    async def close_current_open_page(self) -> bool:
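        # Despite the name, this tears down the whole context: it closes all other
        # pages, closes the browser context, and clears the working page, so that
        # a fresh context can be created on the next check_and_fix_state call.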
        try:
            async with asyncio.timeout(BROWSER_CLOSE_TIMEOUT):
                await self._close_all_other_pages()
                if self.browser_context is not None:
                    await self.browser_context.close()
                self.browser_context = None
                await self.set_working_page(None)
            return True
        except Exception:
            LOG.warning("Error while closing the current open page", exc_info=True)
            return False

    async def stop_page_loading(self) -> None:
        page = await self.__assert_page()
        try:
            await SkyvernFrame.evaluate(frame=page, expression="window.stop()")
        except Exception as e:
            LOG.exception(f"Error while stopping the page load: {repr(e)}")
            raise FailedToStopLoadingPage(url=page.url, error_message=repr(e))

    async def new_page(self) -> Page:
        if self.browser_context is None:
            raise EmptyBrowserContext()
        return await self.browser_context.new_page()

    async def reload_page(self) -> None:
        page = await self.__assert_page()

        LOG.info("Reload page", url=page.url)
        try:
            start_time = time.time()
            await page.reload(timeout=settings.BROWSER_LOADING_TIMEOUT_MS)
            end_time = time.time()
            LOG.info(
                "Page loading time",
                loading_time=end_time - start_time,
            )
            await self._wait_for_settle()
        except Exception as e:
            LOG.exception(f"Error while reloading the page: {repr(e)}")
            raise FailedToReloadPage(url=page.url, error_message=repr(e))

    async def scrape_website(
        self,
        url: str,
        cleanup_element_tree: CleanupElementTreeFunc,
        num_retry: int = 0,
        max_retries: int = settings.MAX_SCRAPING_RETRIES,
        scrape_exclude: ScrapeExcludeFunc | None = None,
        take_screenshots: bool = True,
        draw_boxes: bool = True,
        max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
        scroll: bool = True,
        support_empty_page: bool = False,
        wait_seconds: float = 0,
        must_included_tags: list[str] | None = None,
    ) -> ScrapedPage:
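        # Thin wrapper: delegate to scraper.scrape_website, passing this browser
        # state through so the scraper can drive the working page.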
        return await scraper.scrape_website(
            browser_state=self,
            url=url,
            cleanup_element_tree=cleanup_element_tree,
            num_retry=num_retry,
            max_retries=max_retries,
            scrape_exclude=scrape_exclude,
            take_screenshots=take_screenshots,
            draw_boxes=draw_boxes,
            max_screenshot_number=max_screenshot_number,
            scroll=scroll,
            support_empty_page=support_empty_page,
            wait_seconds=wait_seconds,
            must_included_tags=must_included_tags,
        )

    async def close(self, close_browser_on_completion: bool = True) -> None:
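        # Two-phase shutdown, each phase bounded by BROWSER_CLOSE_TIMEOUT: first
        # close the browser context and run the cleanup hook, then stop Playwright.
        # Timeouts are logged rather than raised so shutdown always completes.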
        LOG.info("Closing browser state")
        try:
            async with asyncio.timeout(BROWSER_CLOSE_TIMEOUT):
                if self.browser_context and close_browser_on_completion:
                    LOG.info("Closing browser context and its pages")
                    try:
                        await self.browser_context.close()
                    except Exception:
                        LOG.warning("Failed to close browser context", exc_info=True)
                    LOG.info("Main browser context and all its pages are closed")
                    if self.browser_cleanup is not None:
                        try:
                            await self.browser_cleanup()
                            LOG.info("Main browser cleanup is executed")
                        except Exception:
                            LOG.warning("Failed to execute browser cleanup", exc_info=True)
        except asyncio.TimeoutError:
            LOG.error("Timed out closing the browser context; going to stop playwright directly")

        try:
            async with asyncio.timeout(BROWSER_CLOSE_TIMEOUT):
                if self.pw and close_browser_on_completion:
                    try:
                        LOG.info("Stopping playwright")
                        await self.pw.stop()
                        LOG.info("Playwright is stopped")
                    except Exception:
                        LOG.warning("Failed to stop playwright", exc_info=True)
        except asyncio.TimeoutError:
            LOG.error("Timed out stopping playwright; the browser may be left open forever")

    async def take_fullpage_screenshot(
        self,
        file_path: str | None = None,
    ) -> bytes:
        page = await self.__assert_page()
        return await SkyvernFrame.take_scrolling_screenshot(
            page=page,
            file_path=file_path,
            mode=ScreenshotMode.LITE,
        )

    @traced(name="skyvern.browser.post_action_screenshot")
    async def take_post_action_screenshot(
        self,
        scrolling_number: int,
        file_path: str | None = None,
    ) -> bytes:
        page = await self.__assert_page()
        return await SkyvernFrame.take_scrolling_screenshot(
            page=page,
            file_path=file_path,
            mode=ScreenshotMode.LITE,
            scrolling_number=scrolling_number,
        )
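
# Example usage (a minimal sketch, not part of the module; assumes an asyncio
# event loop and that the Playwright handle comes from async_playwright(), as
# documented in playwright.async_api):
#
#     import asyncio
#     from playwright.async_api import async_playwright
#
#     async def demo() -> None:
#         pw = await async_playwright().start()
#         state = RealBrowserState(pw)
#         page = await state.get_or_create_page(url="https://example.com")
#         await state.take_fullpage_screenshot(file_path="example.png")
#         await state.close()
#
#     asyncio.run(demo())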