Skyvern/skyvern/forge/sdk/copilot/tracing_setup.py

"""Tracing helpers for the workflow copilot OpenAI Agents SDK integration.

Tracing is dev-only and opt-in. See ``cloud_docs/local-dev/copilot-tracing.md``
for how to enable it without adding logfire to the project lockfile.
"""

from __future__ import annotations

import contextlib
import contextvars
import json
import os
import threading
from typing import Any

import structlog

# Reuse the HTTP-logging redactor so trace-side and SSE-side redaction share
# one exact-match sensitive-key policy.
from skyvern.forge.request_logging import redact_sensitive_fields
from skyvern.forge.sdk.core import skyvern_context

# Module-wide structured logger.
LOG = structlog.get_logger()

# Values of COPILOT_TRACING_ENABLED that turn tracing on. The env value is
# stripped and lower-cased before being compared against this set.
_TRACING_ENABLED_VALUES = frozenset({"1", "true", "yes"})
# Serializes the one-time setup in ensure_tracing_initialized().
_TRACING_INIT_LOCK = threading.Lock()
# Flipped to True once ensure_tracing_initialized() has finished, whether it
# configured logfire or disabled/cleared the SDK's own tracing.
_TRACING_INITIALIZED = False
# Set the first time the per-span rename patch fails so the warning is not
# repeated for every span. is_set() is a cheap flag read on the hot path and
# set() is idempotent. Note that _warn_span_rename_once() does is_set() and
# set() as two separate steps without a lock, so threads hitting the first
# failure concurrently may each log the warning once.
_SPAN_RENAME_WARNED = threading.Event()

# Logfire private-internals path the patch reaches into. Kept here so it is
# easy to audit when a logfire upgrade surfaces the rename-warning.
# These two constants are only attached to the warning log emitted when
# patching fails; _patch_agent_span_attributes() spells the import path and
# attribute names out literally, so keep both places in sync.
_LOGFIRE_PRIVATE_MODULE = "logfire._internal.integrations.openai_agents"
_LOGFIRE_PATCH_SYMBOLS = ("attributes_from_span_data", "LogfireTraceProviderWrapper")

# Set by agent.py before running the agent so the span patch can read it.
# Defaults to None, in which case the patch omits ``gen_ai.request.model``.
_copilot_model_name: contextvars.ContextVar[str | None] = contextvars.ContextVar("_copilot_model_name", default=None)


def is_tracing_enabled() -> bool:
    """Report whether copilot tracing has been switched on via the environment.

    Reads ``COPILOT_TRACING_ENABLED`` on every call (an unset variable is
    treated as an empty string), trims surrounding whitespace, lower-cases
    it, and checks membership in ``_TRACING_ENABLED_VALUES``.

    Returns:
        ``True`` when the normalized value is one of the accepted "on"
        spellings, ``False`` otherwise.
    """
    raw_flag = os.getenv("COPILOT_TRACING_ENABLED", "")
    normalized_flag = raw_flag.strip().lower()
    return normalized_flag in _TRACING_ENABLED_VALUES


def _clear_sdk_trace_processors() -> None:
    """Clear the SDK's built-in trace processors to prevent uploads to OpenAI."""
    try:
        from agents import set_trace_processors

        set_trace_processors([])
    except ImportError:
        pass


def ensure_tracing_initialized() -> None:
    """Run the one-time Agents SDK tracing setup for this process.

    Three outcomes are possible, and each of them marks the module as
    initialized so later calls return immediately:

    * ``COPILOT_TRACING_ENABLED`` is off: the SDK's built-in tracing is
      disabled so it does not try to upload to OpenAI (which fails with 403
      for zero-data-retention orgs or 401 without an OpenAI key).
    * Tracing is on but ``logfire`` is not installed: a warning is logged and
      the SDK's default trace processors are cleared.
    * Tracing is on and ``logfire`` imports: logfire is configured as the
      exporter, the Agents SDK is instrumented, the span-attribute patch is
      applied, and the SDK's default trace processors are cleared.

    If any setup step raises something other than the handled import errors,
    the exception propagates and the module stays uninitialized.
    """
    global _TRACING_INITIALIZED

    # Unlocked fast path for the common already-initialized case.
    if _TRACING_INITIALIZED:
        return

    with _TRACING_INIT_LOCK:
        # Re-check under the lock: another thread may have finished setup
        # while this one was waiting.
        if _TRACING_INITIALIZED:
            return

        logfire_configured = False
        if is_tracing_enabled():
            try:
                import logfire
            except ModuleNotFoundError:
                LOG.warning("Copilot tracing requested but logfire is not installed")
                _clear_sdk_trace_processors()
            else:
                logfire.configure(send_to_logfire="if-token-present", service_name="skyvern-copilot")
                logfire.instrument_openai_agents()
                _patch_agent_span_attributes()
                # Logfire hooks in through OpenTelemetry, independently of the
                # SDK's own processors, so the default OpenAI exporter can be
                # removed without losing logfire spans.
                _clear_sdk_trace_processors()
                logfire_configured = True
        else:
            # Tracing is off: stop the SDK from exporting to OpenAI at all.
            with contextlib.suppress(ImportError):
                from agents import set_tracing_disabled

                set_tracing_disabled(True)

        # The flag is flipped before the success log is emitted.
        _TRACING_INITIALIZED = True
        if logfire_configured:
            LOG.info("Initialized copilot tracing", exporter="logfire")


def _usage_field(obj: Any, *keys: str) -> Any:
    """Return the first present field by key on a usage object/dict."""
    if obj is None:
        return None
    for key in keys:
        if isinstance(obj, dict):
            if key in obj:
                return obj[key]
        else:
            value = getattr(obj, key, None)
            if value is not None:
                return value
    return None


def _attach_cost_attr(attrs: dict[str, Any], usage: Any, model: str | None) -> None:
    """Stamp ``operation.cost`` (USD) on a GenerationSpanData span.

    Matches the attribute Logfire's native ``instrument_openai`` / ``instrument_anthropic``
    integrations emit (via ``genai_prices``). Logfire's AI Agents dashboard keys cost
    off this attribute. Silent on any failure so telemetry cannot break the copilot path.
    """
    if not model or usage is None:
        return
    input_tokens = _usage_field(usage, "input_tokens", "prompt_tokens")
    output_tokens = _usage_field(usage, "output_tokens", "completion_tokens")
    if not isinstance(input_tokens, (int, float)) or not isinstance(output_tokens, (int, float)):
        return
    # OpenAI reports cached prompt tokens nested under input_tokens_details; pull
    # them out so cost reflects the cache discount instead of full input pricing.
    cached = _usage_field(
        _usage_field(usage, "input_tokens_details", "prompt_tokens_details"),
        "cached_tokens",
    )
    cache_read = int(cached) if isinstance(cached, (int, float)) and cached > 0 else None
    try:
        from genai_prices import Usage, calc_price

        price = calc_price(
            Usage(
                input_tokens=int(input_tokens),
                output_tokens=int(output_tokens),
                cache_read_tokens=cache_read,
            ),
            model_ref=model,
            provider_id="openai",
        )
    except Exception:
        return
    attrs["operation.cost"] = float(price.total_price)


def _patch_agent_span_attributes() -> None:
    """Patch logfire to emit OTel GenAI semantic convention attributes on agent spans.

    Logfire's OpenAI agents integration puts the agent name under a plain
    ``name`` attribute but the Agents dashboard requires the OTel semantic
    convention attributes per:

    * https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/
    * https://opentelemetry.io/docs/specs/semconv/gen-ai/openai/

    Required for ``invoke_agent``:
    * ``gen_ai.agent.name``
    * ``gen_ai.operation.name`` = ``invoke_agent``
    * ``gen_ai.provider.name`` = ``openai``
    * ``gen_ai.request.model`` (read from ``_copilot_model_name`` context var)

    Also patches the OTel span name from logfire's default
    ``Agent run: {name!r}`` to the OTel convention
    ``invoke_agent {agent_name}`` so the Logfire Agents dashboard can
    discover the agent.

    Side effects: rebinds ``attributes_from_span_data`` on logfire's private
    ``openai_agents`` integration module and ``create_span`` on its
    ``LogfireTraceProviderWrapper`` class. The two patches are installed one
    after the other, so if the second lookup fails the first patch stays in
    place.

    An ``ImportError`` or ``AttributeError`` raised while installing the
    patches is logged as a warning and swallowed; the function then returns
    normally.

    TODO: remove once logfire natively emits these on AgentSpanData spans.
    """
    try:
        import logfire._internal.integrations.openai_agents as _oai_mod
        from agents import AgentSpanData, FunctionSpanData, GenerationSpanData

        # Patch 1: Add OTel GenAI semconv attributes that logfire doesn't set.
        # - AgentSpanData: gen_ai.agent.name, gen_ai.operation.name, etc.
        # - GenerationSpanData: gen_ai.operation.name = "chat"
        # - FunctionSpanData: gen_ai.operation.name = "execute_tool"
        # Keep a handle on logfire's implementation so the wrapper can delegate.
        _original = _oai_mod.attributes_from_span_data

        def _patched(span_data: Any, msg_template: str) -> dict[str, Any]:
            # Start from logfire's own attributes and augment them in place.
            attrs = _original(span_data, msg_template)
            # Agent spans are only enriched when logfire supplied a ``name``;
            # these keys are assigned outright, overwriting any existing value.
            if isinstance(span_data, AgentSpanData) and "name" in attrs:
                attrs["gen_ai.agent.name"] = attrs["name"]
                attrs["gen_ai.operation.name"] = "invoke_agent"
                attrs["gen_ai.provider.name"] = "openai"
                model = _copilot_model_name.get()
                # The context var defaults to None; skip the attribute then.
                if model:
                    attrs["gen_ai.request.model"] = model
            elif isinstance(span_data, GenerationSpanData):
                # setdefault: keep whatever operation name is already present.
                attrs.setdefault("gen_ai.operation.name", "chat")
                _attach_cost_attr(attrs, getattr(span_data, "usage", None), getattr(span_data, "model", None))
            elif isinstance(span_data, FunctionSpanData):
                attrs.setdefault("gen_ai.operation.name", "execute_tool")
                if "name" in attrs:
                    attrs.setdefault("gen_ai.tool.name", attrs["name"])
                # Redact sensitive values in the tool-call arguments before
                # the span is emitted. The `input` attribute is a JSON string
                # serialized by the agents SDK; parse it, apply the shared
                # exact-match redactor, and re-serialize. Only FunctionSpanData
                # is covered here; GenerationSpanData and session history still
                # carry raw values (see follow-up in the copilot eval plan).
                raw_input = attrs.get("input")
                if isinstance(raw_input, str):
                    try:
                        parsed = json.loads(raw_input)
                    except (json.JSONDecodeError, TypeError, ValueError):
                        # Not valid JSON: nothing structured to redact, so the
                        # attribute is left exactly as logfire produced it.
                        parsed = None
                    # A literal JSON ``null`` also parses to None and is
                    # likewise left untouched.
                    if parsed is not None:
                        try:
                            attrs["input"] = json.dumps(redact_sensitive_fields(parsed))
                        except (TypeError, ValueError) as exc:
                            # Fail closed: if redaction or re-serialization raises, drop
                            # the raw value rather than emitting unredacted input to the
                            # trace backend.
                            # NOTE(review): only TypeError/ValueError take this
                            # path; any other exception type raised above
                            # propagates to the caller.
                            attrs["input"] = "[redacted: serialization error]"
                            LOG.warning("Copilot tool-call input redaction failed", error=repr(exc))
            # Tag agent, generation and function spans with the copilot
            # session id when the current skyvern context carries one.
            ctx = skyvern_context.current()
            if ctx is not None and ctx.copilot_session_id is not None:
                if isinstance(span_data, (AgentSpanData, GenerationSpanData, FunctionSpanData)):
                    attrs["copilot.session_id"] = ctx.copilot_session_id
            return attrs

        # Install patch 1 on the module object.
        _oai_mod.attributes_from_span_data = _patched

        # Patch 2: Override the OTel span name for AgentSpanData to match the
        # OTel GenAI semantic convention pattern ``invoke_agent {agent_name}``.
        # Logfire uses ``Agent run: {name!r}`` as both the msg_template and the
        # OTel span name; the Agents dashboard looks for the convention name.
        _wrapper_cls = _oai_mod.LogfireTraceProviderWrapper
        _original_create = _wrapper_cls.create_span

        def _patched_create(
            self: Any,
            span_data: Any,
            span_id: str | None = None,
            parent: Any | None = None,
            disabled: bool = False,
        ) -> Any:
            # Create the span through logfire's wrapper first, forwarding the
            # arguments positionally, then rename it afterwards.
            result = _original_create(self, span_data, span_id, parent, disabled)
            # Only agent spans with a non-empty name are renamed.
            if isinstance(span_data, AgentSpanData) and getattr(span_data, "name", None):
                try:
                    logfire_span = result.span_helper.span
                    logfire_span._span_name = f"invoke_agent {span_data.name}"
                except AttributeError as exc:
                    # The private attribute chain is missing on this result;
                    # report it through the once-only gate and return the
                    # span unrenamed.
                    _warn_span_rename_once(exc)
            return result

        # Install patch 2 on the class, so it applies to every instance.
        _wrapper_cls.create_span = _patched_create
    except (ImportError, AttributeError) as exc:
        # ImportError: logfire's private integration module moved or agents
        # dropped a SpanData subtype. AttributeError: a symbol we reach into
        # was renamed. Either way, degrade gracefully -- the tracing substrate
        # still works, just without the Agents-dashboard semconv attributes.
        LOG.warning(
            "Failed to patch agent span attributes for Logfire Agents dashboard",
            error=repr(exc),
            logfire_private_module=_LOGFIRE_PRIVATE_MODULE,
            expected_symbols=_LOGFIRE_PATCH_SYMBOLS,
        )


def _warn_span_rename_once(exc: BaseException) -> None:
    """Log a span-rename failure unless one has already been reported.

    A logfire private-API rename fails the same way on every span, so
    repeating the warning per span would only flood the agent runtime with
    identical lines. ``_SPAN_RENAME_WARNED`` acts as the gate: once it is
    set, later calls return without logging.

    The check and the set are two separate, unlocked steps. Threads that hit
    the very first failure at the same moment may therefore each emit the
    warning once; that rare duplicate is accepted in exchange for keeping a
    lock off the per-span path.

    Args:
        exc: The exception raised while renaming the span; its ``repr`` is
            attached to the log entry.
    """
    if not _SPAN_RENAME_WARNED.is_set():
        # Close the gate before logging; Event.set() is idempotent, so a
        # concurrent second set is harmless.
        _SPAN_RENAME_WARNED.set()
        LOG.warning(
            "Failed to rename agent span for Logfire Agents dashboard",
            error=repr(exc),
            logfire_private_attribute="span_helper.span._span_name",
        )


def copilot_span(name: str, data: dict[str, Any] | None = None) -> Any:
    """Build a context manager that wraps a block of work in a tracing span.

    With tracing switched off, or when the ``agents`` package is not
    installed, this hands back ``contextlib.nullcontext()`` so call sites can
    use ``with copilot_span(...)`` unconditionally.

    With tracing switched on, the one-time tracing setup is ensured and an
    Agents SDK ``custom_span`` is returned. Callers must already be inside an
    active ``agents.tracing.trace()`` scope: outside of a live trace,
    ``custom_span`` returns a NoOpSpan and the openai-agents package emits an
    error log.

    Args:
        name: Span name.
        data: Optional payload forwarded to ``custom_span`` as ``data``.

    Returns:
        A context manager: either the custom span or a null context.

    Raises:
        ModuleNotFoundError: If importing ``agents.tracing`` fails because of
            a missing module whose name does not start with ``agents``.
    """
    if is_tracing_enabled():
        ensure_tracing_initialized()

        try:
            from agents.tracing import custom_span
        except ModuleNotFoundError as missing:
            missing_module = missing.name
            # Only a missing ``agents*`` module downgrades to a no-op span;
            # any other missing module is a real error and is re-raised.
            if not (missing_module and missing_module.startswith("agents")):
                raise
        else:
            return custom_span(name, data=data)

    return contextlib.nullcontext()