Skyvern/skyvern/webeye/browser_factory.py

from __future__ import annotations
import asyncio
import os
import pathlib
import platform
import random
import re
import shutil
import socket
import subprocess
import time
import uuid
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Awaitable, Callable, Protocol, cast
from urllib.parse import parse_qsl, urlparse
import psutil
import structlog
from playwright.async_api import Browser, BrowserContext, ConsoleMessage, Download, Page, Playwright
from skyvern.config import settings
from skyvern.constants import (
    BROWSER_DOWNLOAD_TIMEOUT,
    SKYVERN_DIR,
)
from skyvern.exceptions import UnknownBrowserType, UnknownErrorWhileCreatingBrowserContext
from skyvern.forge import app
from skyvern.forge.sdk.api.files import get_download_dir, make_temp_directory
from skyvern.forge.sdk.core.skyvern_context import current, ensure_context
from skyvern.schemas.runs import ProxyLocation, get_tzinfo_from_proxy
from skyvern.webeye.browser_artifacts import BrowserArtifacts, VideoArtifact
from skyvern.webeye.cdp_download_interceptor import CDPDownloadInterceptor

LOG = structlog.get_logger()

BrowserCleanupFunc = Callable[[], Awaitable[None]] | None

# Header to signal fresh browser context creation (stripped before sending to websites)
# When set to "true", creates a new incognito-like context instead of reusing existing ones
FRESH_CONTEXT_HEADER = "X-Skyvern-Fresh-Context"


@dataclass
class ParsedBrowserHeaders:
    """Result of parsing extra HTTP headers for internal Skyvern headers."""

    headers: dict[str, str]  # Headers to pass to browser (internal headers stripped)
    use_fresh_context: bool  # Whether to create a fresh browser context
    enable_download: bool  # Whether download interception is enabled


def parse_extra_headers(extra_http_headers: dict[str, str] | None) -> ParsedBrowserHeaders:
    """Parse extra HTTP headers and extract internal Skyvern headers.

    Extracts internal headers (case-insensitive) and returns a copy of headers
    without them, along with the extracted values.

    Args:
        extra_http_headers: Original headers dict (not mutated)

    Returns:
        ParsedBrowserHeaders with stripped headers and extracted values
    """
    headers: dict[str, str] = {}
    use_fresh_context = False
    enable_download = False
    if extra_http_headers:
        for key, value in extra_http_headers.items():
            key_lower = key.lower()
            if key_lower == FRESH_CONTEXT_HEADER.lower():
                use_fresh_context = value.lower() == "true"
            elif key_lower == "enable_download":
                enable_download = bool(value)
            else:
                headers[key] = value
    return ParsedBrowserHeaders(
        headers=headers,
        use_fresh_context=use_fresh_context,
        enable_download=enable_download,
    )
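

# Illustrative use of parse_extra_headers (header values are made up for the example):
#   parsed = parse_extra_headers({"X-Skyvern-Fresh-Context": "TRUE", "Authorization": "Bearer ..."})
#   parsed.use_fresh_context  -> True  (key and value are matched case-insensitively)
#   parsed.headers            -> {"Authorization": "Bearer ..."}  (internal headers stripped)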


def set_browser_console_log(browser_context: BrowserContext, browser_artifacts: BrowserArtifacts) -> None:
    if browser_artifacts.browser_console_log_path is None:
        log_path = f"{settings.LOG_PATH}/{datetime.utcnow().strftime('%Y-%m-%d')}/{uuid.uuid4()}.log"
        try:
            os.makedirs(os.path.dirname(log_path), exist_ok=True)
            # create the empty log file
            with open(log_path, "w") as _:
                pass
        except Exception:
            LOG.warning(
                "Failed to create browser log file",
                log_path=log_path,
                exc_info=True,
            )
            return
        browser_artifacts.browser_console_log_path = log_path

    async def browser_console_log(msg: ConsoleMessage) -> None:
        current_time = datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        key_values = " ".join([f"{key}={value}" for key, value in msg.location.items()])
        format_log = f"{current_time}[{msg.type}]{msg.text} {key_values}\n"
        await browser_artifacts.append_browser_console_log(format_log)

    LOG.info("browser console log is saved", log_path=browser_artifacts.browser_console_log_path)
    browser_context.on("console", browser_console_log)
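

# Example console log line produced by the handler above (illustrative values; the
# trailing key=value pairs come from Playwright's ConsoleMessage.location):
#   2024-05-01T12:00:00.000000Z[error]Uncaught TypeError: x is not a function url=https://example.com/app.js lineNumber=10 columnNumber=5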


def set_download_file_listener(
    browser_context: BrowserContext, download_timeout: float | None = None, **kwargs: Any
) -> None:
    async def listen_to_download(download: Download) -> None:
        workflow_run_id = kwargs.get("workflow_run_id")
        task_id = kwargs.get("task_id")
        try:
            async with asyncio.timeout(download_timeout or BROWSER_DOWNLOAD_TIMEOUT):
                file_path = await download.path()

                if file_path.suffix:
                    return

                LOG.info(
                    "No file extensions, going to add file extension automatically",
                    workflow_run_id=workflow_run_id,
                    task_id=task_id,
                    suggested_filename=download.suggested_filename,
                    url=download.url,
                )
                suffix = Path(download.suggested_filename).suffix
                if suffix:
                    LOG.info(
                        "Add extension according to suggested filename",
                        workflow_run_id=workflow_run_id,
                        task_id=task_id,
                        filepath=str(file_path) + suffix,
                    )
                    file_path.rename(str(file_path) + suffix)
                    return

                parsed_url = urlparse(download.url)
                parsed_qs = parse_qsl(parsed_url.query)
                for key, value in parsed_qs:
                    if key.lower() == "filename":
                        suffix = Path(value).suffix
                        if suffix:
                            LOG.info(
                                "Add extension according to the parsed query params of download url",
                                workflow_run_id=workflow_run_id,
                                task_id=task_id,
                                filename=value,
                            )
                            file_path.rename(str(file_path) + suffix)
                            return

                suffix = Path(parsed_url.path).suffix
                if suffix:
                    LOG.info(
                        "Add extension according to download url path",
                        workflow_run_id=workflow_run_id,
                        task_id=task_id,
                        filepath=str(file_path) + suffix,
                    )
                    file_path.rename(str(file_path) + suffix)
                    return

                # TODO: maybe should try to parse it from URL response
        except asyncio.TimeoutError:
            LOG.error(
                "timeout to download file, going to cancel the download",
                workflow_run_id=workflow_run_id,
                task_id=task_id,
            )
            await download.cancel()
        except Exception:
            LOG.exception(
                "Failed to add file extension name to downloaded file",
                workflow_run_id=workflow_run_id,
                task_id=task_id,
            )

    def listen_to_new_page(page: Page) -> None:
        page.on("download", listen_to_download)

    browser_context.on("page", listen_to_new_page)
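

# Illustrative walk-through of the extension inference above (made-up URL): a download
# saved without a suffix from
#   https://example.com/export?filename=report.pdf
# gets ".pdf" from the "filename" query parameter, so the file on disk is renamed to
# "<original path>.pdf".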


def initialize_download_dir() -> str:
    context = ensure_context()
    return get_download_dir(
        context.run_id if context and context.run_id else context.workflow_run_id or context.task_id
    )


async def _apply_download_behaviour(browser: Browser) -> None:
    context = ensure_context()
    download_dir = get_download_dir(
        context.run_id if context and context.run_id else context.workflow_run_id or context.task_id
    )
    cdp_session = await browser.new_browser_cdp_session()
    await cdp_session.send(
        "Browser.setDownloadBehavior",
        {
            "behavior": "allow",
            "downloadPath": download_dir,
        },
    )
    LOG.info("setDownloadBehavior applied", download_dir=download_dir)


class BrowserContextCreator(Protocol):
    def __call__(
        self, playwright: Playwright, proxy_location: ProxyLocation | None = None, **kwargs: dict[str, Any]
    ) -> Awaitable[tuple[BrowserContext, BrowserArtifacts, BrowserCleanupFunc]]: ...
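

# How creators are wired up (see the register_type calls at the bottom of this file):
# each browser type name maps to a BrowserContextCreator, and
# BrowserContextFactory.create_browser_context dispatches on settings.BROWSER_TYPE, e.g.
#   BrowserContextFactory.register_type("chromium-headless", _create_headless_chromium)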


class BrowserContextFactory:
    _creators: dict[str, BrowserContextCreator] = {}
    _validator: Callable[[Page], Awaitable[bool]] | None = None

    @staticmethod
    def get_subdir() -> str:
        curr_context = current()
        if curr_context and curr_context.task_id:
            return curr_context.task_id
        elif curr_context and curr_context.request_id:
            return curr_context.request_id
        return str(uuid.uuid4())

    @staticmethod
    def update_chromium_browser_preferences(
        user_data_dir: str,
        download_dir: str,
        preference_template_path: str | None = None,
    ) -> None:
        preference_dst_folder = f"{user_data_dir}/Default"
        os.makedirs(preference_dst_folder, exist_ok=True)
        preference_dst_file = f"{preference_dst_folder}/Preferences"
        preference_template = preference_template_path or f"{SKYVERN_DIR}/webeye/chromium_preferences.json"
        preference_file_content = ""
        with open(preference_template) as f:
            preference_file_content = f.read()
        preference_file_content = preference_file_content.replace("MASK_SAVEFILE_DEFAULT_DIRECTORY", download_dir)
        preference_file_content = preference_file_content.replace("MASK_DOWNLOAD_DEFAULT_DIRECTORY", download_dir)
        with open(preference_dst_file, "w") as f:
            f.write(preference_file_content)

    @staticmethod
    def build_browser_args(
        proxy_location: ProxyLocation | None = None,
        cdp_port: int | None = None,
        extra_http_headers: dict[str, str] | None = None,
    ) -> dict[str, Any]:
        video_dir = f"{settings.VIDEO_PATH}/{datetime.utcnow().strftime('%Y-%m-%d')}"
        har_dir = (
            f"{settings.HAR_PATH}/{datetime.utcnow().strftime('%Y-%m-%d')}/{BrowserContextFactory.get_subdir()}.har"
        )

        extension_paths = []
        if settings.EXTENSIONS and settings.EXTENSIONS_BASE_PATH:
            try:
                os.makedirs(settings.EXTENSIONS_BASE_PATH, exist_ok=True)
                extension_paths = [str(Path(settings.EXTENSIONS_BASE_PATH) / ext) for ext in settings.EXTENSIONS]
                LOG.info("Extensions paths constructed", extension_paths=extension_paths)
            except Exception as e:
                LOG.error("Error constructing extension paths", error=str(e))

        browser_args = [
            "--disable-blink-features=AutomationControlled",
            "--disk-cache-size=1",
            "--start-maximized",
            "--kiosk-printing",
        ]
        if cdp_port:
            browser_args.append(f"--remote-debugging-port={cdp_port}")

        if extension_paths:
            joined_paths = ",".join(extension_paths)
            browser_args.extend([f"--disable-extensions-except={joined_paths}", f"--load-extension={joined_paths}"])
            LOG.info("Extensions added to browser args", extensions=joined_paths)

        browser_args.extend(settings.BROWSER_ADDITIONAL_ARGS)

        args = {
            "color_scheme": "no-preference",
            "args": browser_args,
            "ignore_default_args": [
                "--enable-automation",
            ],
            "record_har_path": har_dir,
            "record_video_dir": video_dir,
            "viewport": {
                "width": settings.BROWSER_WIDTH,
                "height": settings.BROWSER_HEIGHT,
            },
            "extra_http_headers": extra_http_headers,
        }
        if settings.BROWSER_LOCALE:
            args["locale"] = settings.BROWSER_LOCALE
        if settings.ENABLE_PROXY:
            proxy_config = setup_proxy()
            if proxy_config:
                args["proxy"] = proxy_config
        if proxy_location:
            if tz_info := get_tzinfo_from_proxy(proxy_location=proxy_location):
                args["timezone_id"] = tz_info.key
        return args
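
    # Illustrative shape of the dict build_browser_args returns (actual values come from
    # settings; "locale", "proxy", and "timezone_id" are added only when configured):
    #   {
    #       "color_scheme": "no-preference",
    #       "args": ["--disable-blink-features=AutomationControlled", ...],
    #       "ignore_default_args": ["--enable-automation"],
    #       "record_har_path": "<HAR_PATH>/<date>/<subdir>.har",
    #       "record_video_dir": "<VIDEO_PATH>/<date>",
    #       "viewport": {"width": <BROWSER_WIDTH>, "height": <BROWSER_HEIGHT>},
    #       "extra_http_headers": {...} or None,
    #   }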

    @staticmethod
    def build_browser_artifacts(
        video_artifacts: list[VideoArtifact] | None = None,
        har_path: str | None = None,
        traces_dir: str | None = None,
        browser_session_dir: str | None = None,
        browser_console_log_path: str | None = None,
    ) -> BrowserArtifacts:
        return BrowserArtifacts(
            video_artifacts=video_artifacts or [],
            har_path=har_path,
            traces_dir=traces_dir,
            browser_session_dir=browser_session_dir,
            browser_console_log_path=browser_console_log_path,
        )

    @classmethod
    def register_type(cls, browser_type: str, creator: BrowserContextCreator) -> None:
        cls._creators[browser_type] = creator

    @classmethod
    async def create_browser_context(
        cls, playwright: Playwright, **kwargs: Any
    ) -> tuple[BrowserContext, BrowserArtifacts, BrowserCleanupFunc]:
        browser_type = settings.BROWSER_TYPE
        browser_context: BrowserContext | None = None
        try:
            creator = cls._creators.get(browser_type)
            if not creator:
                raise UnknownBrowserType(browser_type)
            browser_context, browser_artifacts, cleanup_func = await creator(playwright, **kwargs)
            if settings.BROWSER_LOGS_ENABLED:
                set_browser_console_log(browser_context=browser_context, browser_artifacts=browser_artifacts)
            set_download_file_listener(browser_context=browser_context, **kwargs)
            proxy_location: ProxyLocation | None = kwargs.get("proxy_location")
            if proxy_location is not None:
                context = ensure_context()
                context.tz_info = get_tzinfo_from_proxy(proxy_location)
            return browser_context, browser_artifacts, cleanup_func
        except Exception as e:
            if browser_context is not None:
                # FIXME: sometimes it can't close the browser context?
                LOG.error("unexpected error happens after created browser context, going to close the context")
                await browser_context.close()
            if isinstance(e, UnknownBrowserType):
                raise e
            raise UnknownErrorWhileCreatingBrowserContext(browser_type, e) from e
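

# setup_proxy picks a random proxy from settings.HOSTED_PROXY_POOL, which is expected to be a
# comma-separated list of proxy URLs with optional inline credentials (illustrative value):
#   HOSTED_PROXY_POOL="http://user:pass@proxy1.example.com:8080,socks5://proxy2.example.com:1080"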


def setup_proxy() -> dict | None:
    if not settings.HOSTED_PROXY_POOL or settings.HOSTED_PROXY_POOL.strip() == "":
        LOG.warning("No proxy server value found. Continuing without using proxy...")
        return None

    proxy_servers = [server.strip() for server in settings.HOSTED_PROXY_POOL.split(",") if server.strip()]
    if not proxy_servers:
        LOG.warning("Proxy pool contains only empty values. Continuing without proxy...")
        return None

    valid_proxies = []
    for proxy in proxy_servers:
        if _is_valid_proxy_url(proxy):
            valid_proxies.append(proxy)
        else:
            LOG.warning(f"Invalid proxy URL format: {proxy}")

    if not valid_proxies:
        LOG.warning("No valid proxy URLs found. Continuing without proxy...")
        return None

    try:
        proxy_server = random.choice(valid_proxies)
        proxy_creds = _get_proxy_server_creds(proxy_server)
        LOG.info("Found proxy server creds, using them...")
        return {
            "server": proxy_server,
            "username": proxy_creds.get("username", ""),
            "password": proxy_creds.get("password", ""),
        }
    except Exception as e:
        LOG.warning(f"Error setting up proxy: {e}. Continuing without proxy...")
        return None


def _is_valid_proxy_url(url: str) -> bool:
    PROXY_PATTERN = re.compile(r"^(http|https|socks5):\/\/([^:@]+(:[^@]*)?@)?[^\s:\/]+(:\d+)?$")
    try:
        parsed = urlparse(url)
        if not parsed.scheme or not parsed.netloc:
            return False
        return bool(PROXY_PATTERN.match(url))
    except Exception:
        return False


def _get_proxy_server_creds(proxy: str) -> dict:
    parsed_url = urlparse(proxy)
    if parsed_url.username and parsed_url.password:
        return {"username": parsed_url.username, "password": parsed_url.password}
    LOG.warning("No credentials found in the proxy URL.")
    return {}


def _is_display_server_error(error: Exception) -> bool:
    """Return True if the worker cannot initialize the browser display/graphics stack.

    These errors appear when a headed browser is launched on a worker node
    without a usable display/graphics environment. Retrying with a fresh profile
    will not help — the fix is environment-side (display/EGL/SwiftShader support)
    rather than profile-side.
    """
    error_str = str(error).lower()
    display_indicators = [
        "missing x server",
        "xserver running",
        "no display",
        "$display",
        "the platform failed to initialize",
        "no suitable egl configs found",
        "failed to get config for surface",
        "collectgraphicsinfo failed",
        "glcontext::createoffscreenglsurface failed",
        "exiting gpu process due to errors during initialization",
    ]
    return any(indicator in error_str for indicator in display_indicators)


def _is_browser_profile_corruption_error(error: Exception) -> bool:
    """Return True if the error is consistent with a corrupted or bloated browser profile.

    These errors appear when launch_persistent_context fails to start Chrome because
    the user_data_dir is in a bad state (corrupted files, oversized cache, lock files
    from a prior crash, etc.). The error text comes from Playwright's CDP driver.
    """
    if _is_display_server_error(error):
        return False
    error_str = str(error).lower()
    corruption_indicators = [
        "connection closed while reading from the driver",
        "target closed",
        "browser has been closed",
        "failed to launch",
        "unable to open database",
    ]
    return any(indicator in error_str for indicator in corruption_indicators)


def _get_cdp_port(kwargs: dict) -> int | None:
    raw_cdp_port = kwargs.get("cdp_port")
    if isinstance(raw_cdp_port, (int, str)):
        try:
            return int(raw_cdp_port)
        except (ValueError, TypeError):
            return None
    return None


def _is_port_in_use(port: int) -> bool:
    """Check if a port is already in use."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        try:
            s.bind(("localhost", port))
            return False
        except OSError:
            return True


def _is_chrome_running() -> bool:
    """Check if Chrome is already running."""
    chrome_process_names = ["chrome", "google-chrome", "google chrome"]
    for proc in psutil.process_iter(["name"]):
        try:
            proc_name = proc.info["name"].lower()
            if proc_name == "chrome_crashpad_handler":
                continue
            if any(chrome_name in proc_name for chrome_name in chrome_process_names):
                return True
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    return False


async def _create_headless_chromium(
    playwright: Playwright,
    proxy_location: ProxyLocation | None = None,
    extra_http_headers: dict[str, str] | None = None,
    **kwargs: dict,
) -> tuple[BrowserContext, BrowserArtifacts, BrowserCleanupFunc]:
    if browser_address := kwargs.get("browser_address"):
        return await _connect_to_cdp_browser(
            playwright,
            remote_browser_url=str(browser_address),
            extra_http_headers=extra_http_headers,
            apply_download_behaviour=True,
        )

    # Check for browser_profile_id and load from storage if available
    browser_profile_id = cast(str | None, kwargs.get("browser_profile_id"))
    organization_id_for_profile = cast(str | None, kwargs.get("organization_id"))
    user_data_dir: str | None = None
    loaded_from_saved_profile = False
    if browser_profile_id and organization_id_for_profile:
        profile_dir = await app.STORAGE.retrieve_browser_profile(
            organization_id=organization_id_for_profile,
            profile_id=browser_profile_id,
        )
        if profile_dir:
            user_data_dir = profile_dir
            loaded_from_saved_profile = True
            LOG.info(
                "Using browser profile",
                browser_profile_id=browser_profile_id,
                profile_dir=profile_dir,
            )
        else:
            LOG.warning(
                "Browser profile not found, using temp directory",
                browser_profile_id=browser_profile_id,
                organization_id=organization_id_for_profile,
            )
    if not user_data_dir:
        user_data_dir = make_temp_directory(prefix="skyvern_browser_")

    download_dir = initialize_download_dir()
    BrowserContextFactory.update_chromium_browser_preferences(
        user_data_dir=user_data_dir,
        download_dir=download_dir,
    )
    cdp_port: int | None = _get_cdp_port(kwargs)
    browser_args = BrowserContextFactory.build_browser_args(
        proxy_location=proxy_location, cdp_port=cdp_port, extra_http_headers=extra_http_headers
    )
    browser_args.update(
        {
            "user_data_dir": user_data_dir,
            "downloads_path": download_dir,
        }
    )
    browser_artifacts = BrowserContextFactory.build_browser_artifacts(
        har_path=browser_args["record_har_path"],
        browser_session_dir=user_data_dir,
    )
    try:
        browser_context = await playwright.chromium.launch_persistent_context(**browser_args)
    except Exception as launch_error:
        if loaded_from_saved_profile and _is_browser_profile_corruption_error(launch_error):
            LOG.warning(
                "Browser launch failed with saved profile — profile may be corrupted, falling back to fresh profile",
                browser_profile_id=browser_profile_id,
                organization_id=organization_id_for_profile,
                error=str(launch_error),
            )
            fallback_dir = make_temp_directory(prefix="skyvern_browser_")
            BrowserContextFactory.update_chromium_browser_preferences(
                user_data_dir=fallback_dir,
                download_dir=download_dir,
            )
            browser_args["user_data_dir"] = fallback_dir
            browser_artifacts = BrowserContextFactory.build_browser_artifacts(
                har_path=browser_args["record_har_path"],
                browser_session_dir=fallback_dir,
            )
            browser_context = await playwright.chromium.launch_persistent_context(**browser_args)
        else:
            raise
    return browser_context, browser_artifacts, None


async def _create_headful_chromium(
    playwright: Playwright,
    proxy_location: ProxyLocation | None = None,
    extra_http_headers: dict[str, str] | None = None,
    **kwargs: dict,
) -> tuple[BrowserContext, BrowserArtifacts, BrowserCleanupFunc]:
    if browser_address := kwargs.get("browser_address"):
        return await _connect_to_cdp_browser(
            playwright,
            remote_browser_url=str(browser_address),
            extra_http_headers=extra_http_headers,
            apply_download_behaviour=True,
        )

    # Check for browser_profile_id and load from storage if available
    browser_profile_id = cast(str | None, kwargs.get("browser_profile_id"))
    organization_id_for_profile = cast(str | None, kwargs.get("organization_id"))
    user_data_dir: str | None = None
    loaded_from_saved_profile = False
    if browser_profile_id and organization_id_for_profile:
        profile_dir = await app.STORAGE.retrieve_browser_profile(
            organization_id=organization_id_for_profile,
            profile_id=browser_profile_id,
        )
        if profile_dir:
            user_data_dir = profile_dir
            loaded_from_saved_profile = True
            LOG.info(
                "Using browser profile",
                browser_profile_id=browser_profile_id,
                profile_dir=profile_dir,
            )
        else:
            LOG.warning(
                "Browser profile not found, using temp directory",
                browser_profile_id=browser_profile_id,
                organization_id=organization_id_for_profile,
            )
    if not user_data_dir:
        user_data_dir = make_temp_directory(prefix="skyvern_browser_")

    download_dir = initialize_download_dir()
    BrowserContextFactory.update_chromium_browser_preferences(
        user_data_dir=user_data_dir,
        download_dir=download_dir,
    )
    cdp_port: int | None = _get_cdp_port(kwargs)
    browser_args = BrowserContextFactory.build_browser_args(
        proxy_location=proxy_location, cdp_port=cdp_port, extra_http_headers=extra_http_headers
    )
    browser_args.update(
        {
            "user_data_dir": user_data_dir,
            "downloads_path": download_dir,
            "headless": False,
        }
    )
    browser_artifacts = BrowserContextFactory.build_browser_artifacts(
        har_path=browser_args["record_har_path"],
        browser_session_dir=user_data_dir,
    )
    try:
        browser_context = await playwright.chromium.launch_persistent_context(**browser_args)
    except Exception as launch_error:
        if loaded_from_saved_profile and _is_browser_profile_corruption_error(launch_error):
            LOG.warning(
                "Browser launch failed with saved profile — profile may be corrupted, falling back to fresh profile",
                browser_profile_id=browser_profile_id,
                organization_id=organization_id_for_profile,
                error=str(launch_error),
            )
            fallback_dir = make_temp_directory(prefix="skyvern_browser_")
            BrowserContextFactory.update_chromium_browser_preferences(
                user_data_dir=fallback_dir,
                download_dir=download_dir,
            )
            browser_args["user_data_dir"] = fallback_dir
            browser_artifacts = BrowserContextFactory.build_browser_artifacts(
                har_path=browser_args["record_har_path"],
                browser_session_dir=fallback_dir,
            )
            browser_context = await playwright.chromium.launch_persistent_context(**browser_args)
        else:
            raise
    return browser_context, browser_artifacts, None


def default_user_data_dir() -> pathlib.Path:
    p = platform.system()
    if p == "Darwin":
        return pathlib.Path("~/Library/Application Support/Google/Chrome").expanduser()
    if p == "Windows":
        return pathlib.Path(os.environ["LOCALAPPDATA"]) / "Google" / "Chrome" / "User Data"
    # Assume Linux/Unix
    return pathlib.Path("~/.config/google-chrome").expanduser()
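

# is_valid_chromium_user_data_dir (below) expects at minimum this layout on disk:
#   <user_data_dir>/
#       Default/
#           Preferences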


def is_valid_chromium_user_data_dir(directory: str) -> bool:
    """Check if a directory is a valid Chromium user data directory.

    A valid Chromium user data directory should:
    1. Exist
    2. Not be empty
    3. Contain a 'Default' directory
    4. Have a 'Preferences' file in the 'Default' directory
    """
    if not os.path.exists(directory):
        return False
    default_dir = os.path.join(directory, "Default")
    preferences_file = os.path.join(default_dir, "Preferences")
    return os.path.isdir(directory) and os.path.isdir(default_dir) and os.path.isfile(preferences_file)


async def _create_cdp_connection_browser(
    playwright: Playwright,
    proxy_location: ProxyLocation | None = None,
    extra_http_headers: dict[str, str] | None = None,
    **kwargs: dict,
) -> tuple[BrowserContext, BrowserArtifacts, BrowserCleanupFunc]:
    if browser_address := kwargs.get("browser_address"):
        return await _connect_to_cdp_browser(
            playwright,
            remote_browser_url=str(browser_address),
            extra_http_headers=extra_http_headers,
            apply_download_behaviour=True,
        )

    browser_type = settings.BROWSER_TYPE
    browser_path = settings.CHROME_EXECUTABLE_PATH
    if browser_type == "cdp-connect" and browser_path:
        LOG.info("Local browser path is given. Connecting to local browser with CDP", browser_path=browser_path)
        # First check if the debugging port is running and can be used
        if not _is_port_in_use(9222):
            LOG.info("Port 9222 is not in use, starting Chrome", browser_path=browser_path)
            # Check if Chrome is already running
            if _is_chrome_running():
                raise Exception(
                    "Chrome is already running. Please close all Chrome instances before starting with remote debugging."
                )

            # check if ./tmp/user_data_dir exists and if it's a valid Chromium user data directory
            try:
                if os.path.exists("./tmp/user_data_dir") and not is_valid_chromium_user_data_dir("./tmp/user_data_dir"):
                    LOG.info("Removing invalid user data directory")
                    shutil.rmtree("./tmp/user_data_dir")
                    shutil.copytree(default_user_data_dir(), "./tmp/user_data_dir")
                elif not os.path.exists("./tmp/user_data_dir"):
                    LOG.info("Copying default user data directory")
                    shutil.copytree(default_user_data_dir(), "./tmp/user_data_dir")
                else:
                    LOG.info("User data directory is valid")
            except FileExistsError:
                # If directory exists, remove it first then copy
                shutil.rmtree("./tmp/user_data_dir")
                shutil.copytree(default_user_data_dir(), "./tmp/user_data_dir")

            browser_process = subprocess.Popen(
                [
                    browser_path,
                    "--remote-debugging-port=9222",
                    "--no-first-run",
                    "--no-default-browser-check",
                    "--user-data-dir=./tmp/user_data_dir",
                    "--remote-debugging-address=0.0.0.0",
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            # Add small delay to allow browser to start
            time.sleep(1)
            if browser_process.poll() is not None:
                raise Exception(f"Failed to open browser. browser_path: {browser_path}")
        else:
            LOG.info("Port 9222 is in use, using existing browser")

    return await _connect_to_cdp_browser(playwright, settings.BROWSER_REMOTE_DEBUGGING_URL, extra_http_headers)


async def _connect_to_cdp_browser(
    playwright: Playwright,
    remote_browser_url: str,
    extra_http_headers: dict[str, str] | None = None,
    apply_download_behaviour: bool = False,
) -> tuple[BrowserContext, BrowserArtifacts, BrowserCleanupFunc]:
    parsed_headers = parse_extra_headers(extra_http_headers)
    browser_args = BrowserContextFactory.build_browser_args(
        extra_http_headers=parsed_headers.headers if parsed_headers.headers else None
    )
    browser_artifacts = BrowserContextFactory.build_browser_artifacts(
        har_path=browser_args["record_har_path"],
    )
    LOG.info("Connecting browser CDP connection", remote_browser_url=remote_browser_url)
    browser = await playwright.chromium.connect_over_cdp(remote_browser_url)
    if apply_download_behaviour:
        await _apply_download_behaviour(browser)

    # Decide whether to create fresh context or reuse existing one
    contexts = browser.contexts
    browser_context = None
    if parsed_headers.use_fresh_context or not contexts:
        LOG.info(
            "Creating new browser context",
            fresh_context_requested=parsed_headers.use_fresh_context,
            existing_contexts=len(contexts),
        )
        browser_context = await browser.new_context(
            record_video_dir=browser_args["record_video_dir"],
            viewport=browser_args["viewport"],
            extra_http_headers=browser_args["extra_http_headers"],
        )
    else:
        LOG.info("Reusing existing browser context", existing_contexts=len(contexts))
        browser_context = contexts[0]

    # Enable CDPDownloadInterceptor when enable_download is set.
    # This captures downloads via the Fetch domain and saves them locally.
    if parsed_headers.enable_download:
        download_dir = initialize_download_dir()
        interceptor = CDPDownloadInterceptor(output_dir=download_dir)

        # Enable interception on all existing pages
        for page in browser_context.pages:
            try:
                await interceptor.enable_for_page(page)
            except Exception:
                LOG.warning("Failed to enable CDP intercept on page", page_url=page.url, exc_info=True)

        # Auto-enable interception on new pages (e.g., target="_blank" links)
        async def _on_new_page(page: Page) -> None:
            try:
                await interceptor.enable_for_page(page)
            except Exception:
                LOG.warning("Failed to enable CDP intercept on new page", page_url=page.url, exc_info=True)

        browser_context.on("page", lambda page: asyncio.ensure_future(_on_new_page(page)))
        LOG.info(
            "CDP download interceptor enabled",
            download_dir=download_dir,
            existing_page_count=len(browser_context.pages),
        )

    LOG.info(
        "Launched browser CDP connection",
        remote_browser_url=remote_browser_url,
    )
    return browser_context, browser_artifacts, None
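

# Register the built-in creators; settings.BROWSER_TYPE selects which one
# BrowserContextFactory.create_browser_context dispatches to.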
BrowserContextFactory.register_type("chromium-headless", _create_headless_chromium)
BrowserContextFactory.register_type("chromium-headful", _create_headful_chromium)
BrowserContextFactory.register_type("cdp-connect", _create_cdp_connection_browser)