free-claude-code/providers/utils/think_parser.py

"""Think tag parser for extracting reasoning content from responses."""

import re
from dataclasses import dataclass
from typing import Optional, Tuple, Iterator, Any
from enum import Enum


class ContentType(Enum):
    """Type of content chunk."""

    TEXT = "text"
    THINKING = "thinking"


@dataclass
class ContentChunk:
    """A chunk of parsed content."""

    type: ContentType
    content: str


class ThinkTagParser:
    """
    Streaming parser for <think>...</think> tags.

    Handles partial tags at chunk boundaries by buffering.
    """

    OPEN_TAG = "<think>"
    CLOSE_TAG = "</think>"
    OPEN_TAG_LEN = 7
    CLOSE_TAG_LEN = 8

    def __init__(self):
        self._buffer: str = ""
        self._in_think_tag: bool = False

    @property
    def in_think_mode(self) -> bool:
        """Whether currently inside a think tag."""
        return self._in_think_tag

    def feed(self, content: str) -> Iterator[ContentChunk]:
        """
        Feed content and yield parsed chunks.

        Handles partial tags by buffering content near potential tag boundaries.
        """
        self._buffer += content

        while self._buffer:
            if not self._in_think_tag:
                chunk = self._parse_outside_think()
                if chunk:
                    yield chunk
                else:
                    break
            else:
                chunk = self._parse_inside_think()
                if chunk:
                    yield chunk
                else:
                    break

    def _parse_outside_think(self) -> Optional[ContentChunk]:
        """Parse content outside think tags."""
        think_start = self._buffer.find(self.OPEN_TAG)

        if think_start == -1:
            # No tag found - check for partial tag at end
            # We buffer any trailing '<' and subsequent characters that could be part of <think>
            last_bracket = self._buffer.rfind("<")
            if (
                last_bracket != -1
                and len(self._buffer) - last_bracket < self.OPEN_TAG_LEN
            ):
                # Check if the partial string could be the start of <think>
                potential_tag = self._buffer[last_bracket:]
                if self.OPEN_TAG.startswith(potential_tag):
                    emit = self._buffer[:last_bracket]
                    self._buffer = self._buffer[last_bracket:]
                    if emit:
                        return ContentChunk(ContentType.TEXT, emit)
                    return None

            # No partial tag found or it's irrelevant
            emit = self._buffer
            self._buffer = ""
            if emit:
                return ContentChunk(ContentType.TEXT, emit)
            return None
        else:
            # Found <think> tag
            pre_think = self._buffer[:think_start]
            self._buffer = self._buffer[think_start + self.OPEN_TAG_LEN :]
            self._in_think_tag = True
            if pre_think:
                return ContentChunk(ContentType.TEXT, pre_think)
            # Continue parsing inside think tag
            return self._parse_inside_think()

    def _parse_inside_think(self) -> Optional[ContentChunk]:
        """Parse content inside think tags."""
        think_end = self._buffer.find(self.CLOSE_TAG)

        if think_end == -1:
            # No closing tag - check for partial at end
            last_bracket = self._buffer.rfind("<")
            if (
                last_bracket != -1
                and len(self._buffer) - last_bracket < self.CLOSE_TAG_LEN
            ):
                # Check if the partial string could be the start of </think>
                potential_tag = self._buffer[last_bracket:]
                if self.CLOSE_TAG.startswith(potential_tag):
                    emit = self._buffer[:last_bracket]
                    self._buffer = self._buffer[last_bracket:]
                    if emit:
                        return ContentChunk(ContentType.THINKING, emit)
                    return None

            emit = self._buffer
            self._buffer = ""
            if emit:
                return ContentChunk(ContentType.THINKING, emit)
            return None
        else:
            # Found </think> tag
            thinking_content = self._buffer[:think_end]
            self._buffer = self._buffer[think_end + self.CLOSE_TAG_LEN :]
            self._in_think_tag = False
            if thinking_content:
                return ContentChunk(ContentType.THINKING, thinking_content)
            # Continue parsing outside think tag
            return self._parse_outside_think()

    def flush(self) -> Optional[ContentChunk]:
        """Flush any remaining buffered content."""
        if self._buffer:
            chunk_type = (
                ContentType.THINKING if self._in_think_tag else ContentType.TEXT
            )
            content = self._buffer
            self._buffer = ""
            return ContentChunk(chunk_type, content)
        return None

    def reset(self):
        """Reset parser state."""
        self._buffer = ""
        self._in_think_tag = False


def extract_think_content(text: str) -> Tuple[Optional[str], str]:
    """
    Extract thinking content from text (non-streaming).

    Returns: (thinking_content, remaining_text)
    """
    think_pattern = re.compile(r"<think>(.*?)</think>", re.DOTALL)
    matches = think_pattern.findall(text)

    if matches:
        thinking = "\n".join(matches)
        remaining = think_pattern.sub("", text).strip()
        return thinking, remaining

    return None, text


def extract_reasoning_from_delta(delta: Any) -> Optional[str]:
    """
    Extract reasoning content from an OpenAI delta object.

    Checks both 'reasoning_content' and 'reasoning_details' fields.
    """
    if isinstance(delta, dict):
        reasoning = delta.get("reasoning_content")
        if reasoning:
            return reasoning

        reasoning_details = delta.get("reasoning_details")
        if reasoning_details and isinstance(reasoning_details, list):
            return "".join(
                item.get("text", "")
                for item in reasoning_details
                if isinstance(item, dict)
            )

    return None