Skyvern/skyvern/core/script_generations/skyvern_page.py

from __future__ import annotations

import asyncio
import copy
import json
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import StrEnum
from typing import Any, Callable, Literal

from playwright.async_api import Page

from skyvern.config import settings
from skyvern.exceptions import WorkflowRunNotFound
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.api.files import download_file
from skyvern.forge.sdk.artifact.models import ArtifactType
from skyvern.forge.sdk.core import skyvern_context
from skyvern.utils.prompt_engine import load_prompt_with_elements
from skyvern.webeye.actions import handler_utils
from skyvern.webeye.actions.action_types import ActionType
from skyvern.webeye.actions.actions import Action, ActionStatus, ExtractAction, SelectOption
from skyvern.webeye.browser_factory import BrowserState
from skyvern.webeye.scraper.scraper import ScrapedPage, scrape_website


class Driver(StrEnum):
    PLAYWRIGHT = "playwright"


@dataclass
class ActionMetadata:
    intention: str = ""
    data: dict[str, Any] | str | None = None
    timestamp: float | None = None  # filled in by recorder
    screenshot_path: str | None = None  # if enabled


@dataclass
class ActionCall:
    name: ActionType
    args: tuple[Any, ...]
    kwargs: dict[str, Any]
    meta: ActionMetadata
    result: Any | None = None  # populated after execution
    error: Exception | None = None  # populated if failed


class SkyvernPage:
    """
    A minimal adapter around the chosen driver that:
    1. Executes real browser commands
    2. Records ActionCallobjects into RunContext.trace
    3. Adds retry / fallback hooks
    """

    def __init__(
        self,
        scraped_page: ScrapedPage,
        page: Page,
        *,
        recorder: Callable[[ActionCall], None] | None = None,
        # generate_response: bool = False,
    ):
        self.scraped_page = scraped_page
        self.page = page
        self._record = recorder or (lambda ac: None)

    @classmethod
    async def _get_or_create_browser_state(cls) -> BrowserState:
        context = skyvern_context.current()
        if context and context.workflow_run_id and context.organization_id:
            workflow_run = await app.DATABASE.get_workflow_run(
                workflow_run_id=context.workflow_run_id, organization_id=context.organization_id
            )
            if workflow_run:
                browser_state = await app.BROWSER_MANAGER.get_or_create_for_workflow_run(
                    workflow_run=workflow_run, browser_session_id=None
                )
            else:
                raise WorkflowRunNotFound(workflow_run_id=context.workflow_run_id)
        else:
            browser_state = await app.BROWSER_MANAGER.get_or_create_for_script()
        return browser_state

    @classmethod
    async def _get_browser_state(cls) -> BrowserState | None:
        context = skyvern_context.current()
        if context and context.workflow_run_id and context.organization_id:
            workflow_run = await app.DATABASE.get_workflow_run(
                workflow_run_id=context.workflow_run_id, organization_id=context.organization_id
            )
            if workflow_run:
                browser_state = app.BROWSER_MANAGER.get_for_workflow_run(workflow_run_id=context.workflow_run_id)
            else:
                raise WorkflowRunNotFound(workflow_run_id=context.workflow_run_id)
        else:
            browser_state = app.BROWSER_MANAGER.get_for_script()
        return browser_state

    @classmethod
    async def create(cls) -> SkyvernPage:
        # initialize browser state
        # TODO: add workflow_run_id or eventually script_id/script_run_id
        browser_state = await cls._get_or_create_browser_state()
        scraped_page = await scrape_website(
            browser_state=browser_state,
            url="",
            cleanup_element_tree=app.AGENT_FUNCTION.cleanup_element_tree_factory(),
            scrape_exclude=app.scrape_exclude,
            max_screenshot_number=settings.MAX_NUM_SCREENSHOTS,
            draw_boxes=True,
            scroll=True,
            support_empty_page=True,
        )
        page = await scraped_page._browser_state.must_get_working_page()
        return cls(scraped_page=scraped_page, page=page)

    @staticmethod
    def action_wrap(
        action: ActionType,
    ) -> Callable:
        """
        Decorator to record the action call.

        Auto-creates action records in DB before action execution
        and screenshot artifacts after action execution.
        """

        def decorator(fn: Callable) -> Callable:
            async def wrapper(
                skyvern_page: SkyvernPage,
                *args: Any,
                intention: str = "",
                data: str | dict[str, Any] = "",
                **kwargs: Any,
            ) -> Any:
                meta = ActionMetadata(intention, data)
                call = ActionCall(action, args, kwargs, meta)

                action_status = ActionStatus.completed

                try:
                    call.result = await fn(
                        skyvern_page, *args, intention=intention, data=data, **kwargs
                    )  # real driver call

                    # Note: Action status would be updated to completed here if update method existed

                    return call.result
                except Exception as e:
                    call.error = e
                    action_status = ActionStatus.failed
                    # Note: Action status would be updated to failed here if update method existed

                    # LLM fallback hook could go here ...
                    raise
                finally:
                    skyvern_page._record(call)
                    # Auto-create action after execution
                    await skyvern_page._create_action_before_execution(
                        action_type=action,
                        intention=intention,
                        status=action_status,
                        data=data,
                        kwargs=kwargs,
                    )

                    # Auto-create screenshot artifact after execution
                    await skyvern_page._create_screenshot_after_execution()

            return wrapper

        return decorator

    async def goto(self, url: str, timeout: float = settings.BROWSER_LOADING_TIMEOUT_MS) -> None:
        await self.page.goto(
            url,
            timeout=timeout,
        )

    async def _create_action_before_execution(
        self,
        action_type: ActionType,
        intention: str = "",
        status: ActionStatus = ActionStatus.pending,
        data: str | dict[str, Any] = "",
        kwargs: dict[str, Any] | None = None,
    ) -> Action | None:
        """Create an action record in the database before execution if task_id and step_id are available."""
        try:
            context = skyvern_context.current()
            if not context or not context.task_id or not context.step_id:
                return None

            # Create action record. TODO: store more action fields
            kwargs = kwargs or {}
            text = kwargs.get("text")
            option_value = kwargs.get("option")
            select_option = SelectOption(value=option_value) if option_value else None
            response: str | None = kwargs.get("response")
            if not response:
                if action_type == ActionType.INPUT_TEXT:
                    response = text
                elif action_type == ActionType.SELECT_OPTION:
                    if select_option:
                        response = select_option.value

            action = Action(
                element_id="",
                action_type=action_type,
                status=status,
                organization_id=context.organization_id,
                workflow_run_id=context.workflow_run_id,
                task_id=context.task_id,
                step_id=context.step_id,
                step_order=0,  # Will be updated by the system if needed
                action_order=0,  # Will be updated by the system if needed
                intention=intention,
                reasoning=f"Auto-generated action for {action_type.value}",
                text=text,
                option=select_option,
                response=response,
                created_by="script",
            )
            if action_type == ActionType.EXTRACT:
                action = ExtractAction(
                    element_id="",
                    action_type=action_type,
                    status=status,
                    organization_id=context.organization_id,
                    workflow_run_id=context.workflow_run_id,
                    task_id=context.task_id,
                    step_id=context.step_id,
                    step_order=0,
                    action_order=0,
                    intention=intention,
                    reasoning=f"Auto-generated action for {action_type.value}",
                    data_extraction_goal=kwargs.get("prompt"),
                    data_extraction_schema=kwargs.get("schema"),
                    option=select_option,
                    response=response,
                    created_by="script",
                )

            created_action = await app.DATABASE.create_action(action)
            return created_action

        except Exception:
            # If action creation fails, don't block the actual action execution
            return None

    @classmethod
    async def _create_screenshot_after_execution(cls) -> None:
        """Create a screenshot artifact after action execution if task_id and step_id are available."""
        try:
            context = skyvern_context.ensure_context()
            if not context or not context.task_id or not context.step_id:
                return

            # Get browser state and take screenshot
            browser_state = await cls._get_browser_state()
            if not browser_state:
                return

            screenshot = await browser_state.take_post_action_screenshot(scrolling_number=0)

            if screenshot:
                # Create a minimal Step object for artifact creation
                step = await app.DATABASE.get_step(
                    context.task_id, context.step_id, organization_id=context.organization_id
                )
                if not step:
                    return

                await app.ARTIFACT_MANAGER.create_artifact(
                    step=step,
                    artifact_type=ArtifactType.SCREENSHOT_ACTION,
                    data=screenshot,
                )

        except Exception:
            # If screenshot creation fails, don't block execution
            pass

    ######### Public Interfaces #########
    @action_wrap(ActionType.CLICK)
    async def click(self, xpath: str, intention: str | None = None, data: str | dict[str, Any] | None = None) -> None:
        """Click an element identified by ``xpath``.

        When ``intention`` and ``data`` are provided a new click action is
        generated via the ``single-click-action`` prompt.  The model returns a
        fresh xpath based on the current DOM and the updated data for this run.
        The browser then clicks the element using this newly generated xpath.

        If the prompt generation or parsing fails for any reason we fall back to
        clicking the originally supplied ``xpath``.
        """
        new_xpath = xpath

        if intention and data:
            try:
                # Build the element tree of the current page for the prompt
                context = skyvern_context.ensure_context()
                payload_str = json.dumps(data) if isinstance(data, (dict, list)) else (data or "")
                refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
                element_tree = refreshed_page.build_element_tree()
                single_click_prompt = prompt_engine.load_prompt(
                    template="single-click-action",
                    navigation_goal=intention,
                    navigation_payload_str=payload_str,
                    current_url=self.page.url,
                    elements=element_tree,
                    local_datetime=datetime.now(context.tz_info or datetime.now().astimezone().tzinfo).isoformat(),
                    user_context=getattr(context, "prompt", None),
                )
                json_response = await app.SINGLE_CLICK_AGENT_LLM_API_HANDLER(
                    prompt=single_click_prompt,
                    prompt_name="single-click-action",
                )
                actions = json_response.get("actions", [])
                if actions:
                    new_xpath = actions[0].get("xpath", xpath) or xpath
            except Exception:
                # If anything goes wrong, fall back to the original xpath
                new_xpath = xpath

        locator = self.page.locator(f"xpath={new_xpath}")
        await locator.click(timeout=5000)

    @action_wrap(ActionType.INPUT_TEXT)
    async def fill(
        self,
        xpath: str,
        text: str,
        intention: str | None = None,
        data: str | dict[str, Any] | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    ) -> None:
        await self._input_text(xpath, text, intention, data, timeout)

    @action_wrap(ActionType.INPUT_TEXT)
    async def type(
        self,
        xpath: str,
        text: str,
        intention: str | None = None,
        data: str | dict[str, Any] | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    ) -> None:
        await self._input_text(xpath, text, intention, data, timeout)

    async def _input_text(
        self,
        xpath: str,
        text: str,
        intention: str | None = None,
        data: str | dict[str, Any] | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    ) -> None:
        """Input text into an element identified by ``xpath``.

        When ``intention`` and ``data`` are provided a new input text action is
        generated via the `script-generation-input-text-generatiion` prompt.  The model returns a
        fresh text based on the current DOM and the updated data for this run.
        The browser then inputs the text using this newly generated text.

        If the prompt generation or parsing fails for any reason we fall back to
        inputting the originally supplied ``text``.
        """
        # format the text with the actual value of the parameter if it's a secret when running a workflow
        context = skyvern_context.current()
        if context and context.workflow_run_id:
            text = await _get_actual_value_of_parameter_if_secret(context.workflow_run_id, text)

        locator = self.page.locator(f"xpath={xpath}")
        await handler_utils.input_sequentially(locator, text, timeout=timeout)

    @action_wrap(ActionType.UPLOAD_FILE)
    async def upload_file(
        self, xpath: str, file_path: str, intention: str | None = None, data: str | dict[str, Any] | None = None
    ) -> None:
        # if self.generate_response:
        #     # TODO: regenerate file_path and xpath
        #     pass
        file = await download_file(file_path)
        await self.page.set_input_files(xpath, file)

    @action_wrap(ActionType.SELECT_OPTION)
    async def select_option(
        self,
        xpath: str,
        option: str,
        intention: str | None = None,
        data: str | dict[str, Any] | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    ) -> None:
        # if self.generate_response:
        #     # TODO: regenerate option
        #     pass
        locator = self.page.locator(f"xpath={xpath}")
        try:
            await locator.click(timeout=timeout)
        except Exception:
            print("Failed to click before select action")
            return
        await locator.select_option(option, timeout=timeout)

    @action_wrap(ActionType.WAIT)
    async def wait(
        self, seconds: float, intention: str | None = None, data: str | dict[str, Any] | None = None
    ) -> None:
        await asyncio.sleep(seconds)

    @action_wrap(ActionType.NULL_ACTION)
    async def null_action(self, intention: str | None = None, data: str | dict[str, Any] | None = None) -> None:
        return

    @action_wrap(ActionType.SOLVE_CAPTCHA)
    async def solve_captcha(
        self, xpath: str, intention: str | None = None, data: str | dict[str, Any] | None = None
    ) -> None:
        await asyncio.sleep(30)

    @action_wrap(ActionType.TERMINATE)
    async def terminate(
        self, errors: list[str], intention: str | None = None, data: str | dict[str, Any] | None = None
    ) -> None:
        # TODO: update the workflow run status to terminated
        return

    @action_wrap(ActionType.COMPLETE)
    async def complete(
        self, data_extraction_goal: str, intention: str | None = None, data: str | dict[str, Any] | None = None
    ) -> None:
        # TODO: update the workflow run status to completed
        return

    @action_wrap(ActionType.RELOAD_PAGE)
    async def reload_page(self, intention: str | None = None, data: str | dict[str, Any] | None = None) -> None:
        await self.page.reload()
        return

    @action_wrap(ActionType.EXTRACT)
    async def extract(
        self,
        prompt: str,
        schema: dict[str, Any] | list | str | None = None,
        error_code_mapping: dict[str, str] | None = None,
        intention: str | None = None,
        data: str | dict[str, Any] | None = None,
    ) -> dict[str, Any] | list | str | None:
        scraped_page_refreshed = await self.scraped_page.refresh()
        context = skyvern_context.current()
        tz_info = datetime.now(tz=timezone.utc).tzinfo
        if context and context.tz_info:
            tz_info = context.tz_info
        extract_information_prompt = load_prompt_with_elements(
            element_tree_builder=scraped_page_refreshed,
            prompt_engine=prompt_engine,
            template_name="extract-information",
            html_need_skyvern_attrs=False,
            data_extraction_goal=prompt,
            extracted_information_schema=schema,
            current_url=scraped_page_refreshed.url,
            extracted_text=scraped_page_refreshed.extracted_text,
            error_code_mapping_str=(json.dumps(error_code_mapping) if error_code_mapping else None),
            local_datetime=datetime.now(tz_info).isoformat(),
        )
        step = None
        if context and context.organization_id and context.task_id and context.step_id:
            step = await app.DATABASE.get_step(
                task_id=context.task_id, step_id=context.step_id, organization_id=context.organization_id
            )

        result = await app.EXTRACTION_LLM_API_HANDLER(
            prompt=extract_information_prompt,
            step=step,
            screenshots=scraped_page_refreshed.screenshots,
            prompt_name="extract-information",
        )
        return result

    @action_wrap(ActionType.VERIFICATION_CODE)
    async def verification_code(
        self, xpath: str, intention: str | None = None, data: str | dict[str, Any] | None = None
    ) -> None:
        return

    @action_wrap(ActionType.SCROLL)
    async def scroll(
        self, scroll_x: int, scroll_y: int, intention: str | None = None, data: str | dict[str, Any] | None = None
    ) -> None:
        await self.page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")

    @action_wrap(ActionType.KEYPRESS)
    async def keypress(
        self,
        keys: list[str],
        hold: bool = False,
        duration: float = 0,
        intention: str | None = None,
        data: str | dict[str, Any] | None = None,
    ) -> None:
        await handler_utils.keypress(self.page, keys, hold=hold, duration=duration)

    @action_wrap(ActionType.MOVE)
    async def move(
        self, x: int, y: int, intention: str | None = None, data: str | dict[str, Any] | None = None
    ) -> None:
        await self.page.mouse.move(x, y)

    @action_wrap(ActionType.DRAG)
    async def drag(
        self,
        start_x: int,
        start_y: int,
        path: list[tuple[int, int]],
        intention: str | None = None,
        data: str | dict[str, Any] | None = None,
    ) -> None:
        await handler_utils.drag(self.page, start_x, start_y, path)

    @action_wrap(ActionType.LEFT_MOUSE)
    async def left_mouse(
        self,
        x: int,
        y: int,
        direction: Literal["down", "up"],
        intention: str | None = None,
        data: str | dict[str, Any] | None = None,
    ) -> None:
        await handler_utils.left_mouse(self.page, x, y, direction)


class RunContext:
    def __init__(
        self, parameters: dict[str, Any], page: SkyvernPage, generated_parameters: dict[str, Any] | None = None
    ) -> None:
        self.original_parameters = parameters
        self.generated_parameters = generated_parameters
        self.parameters = copy.deepcopy(parameters)
        # if generated_parameters:
        #     self.parameters.update(generated_parameters)
        self.page = page
        self.trace: list[ActionCall] = []
        self.prompt: str | None = None


async def _get_actual_value_of_parameter_if_secret(workflow_run_id: str, parameter: str) -> Any:
    """
    Get the actual value of a parameter if it's a secret. If it's not a secret, return the parameter value as is.

    Just return the parameter value if the task isn't a workflow's task.

    This is only used for InputTextAction, UploadFileAction, and ClickAction (if it has a file_url).
    """
    workflow_run_context = app.WORKFLOW_CONTEXT_MANAGER.get_workflow_run_context(workflow_run_id)
    secret_value = workflow_run_context.get_original_secret_value_or_none(parameter)
    return secret_value if secret_value is not None else parameter