multi_agent_chat/middleware: real-graph regression test for partial-pause parallel routing

2026-05-17 03:59:27 +00:00 · 2026-05-14 09:47:24 +02:00 · 2026-05-14 09:47:24 +02:00 · 668b89927b
commit 668b89927b
parent 8e10f38f32
1 changed files with 254 additions and 0 deletions
--- a/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_partial_pause_routing.py
+++ b/surfsense_backend/tests/unit/agents/multi_agent_chat/middleware/checkpointed_subagent_middleware/test_parallel_partial_pause_routing.py
@ -0,0 +1,254 @@
+"""Real-graph contract: one parallel branch completes while a sibling pauses with HITL.
+
+The two existing parallel-routing tests
+(``test_parallel_resume_command_keying`` and
+``test_parallel_heterogeneous_decisions``) both pause **all** branches
+simultaneously. That's the easy case — every dispatched ``task`` call has a
+matching pending interrupt, and the routing helpers see a uniform shape.
+
+Production rarely matches that uniform shape. The orchestrator typically
+delegates "create a Linear ticket and summarize the user's recent activity":
+one branch needs HITL, the other returns its result and exits. At the pause
+moment::
+
+    state.values["messages"] += [ToolMessage(from-A)]   # A merged in
+    state.interrupts          = [Interrupt(value-from-B)]   # B alone is pending
+
+So ``len(state.interrupts) < num_dispatched_tasks``. The slicer and
+``build_lg_resume_map`` must:
+
+1. **Key off ``state.interrupts``, never off the originally dispatched tcids.**
+   A flat decisions list of length 1 must route only to B; if anything tries
+   to look up A in the resume map, langgraph rejects an unknown
+   ``Interrupt.id``.
+2. **Leave A's contributions intact across resume.** A's ToolMessage was
+   committed at the pause; resuming the paused branch must not re-run A nor
+   drop its message.
+3. **Drain the single pending interrupt.** Final ``state.interrupts`` is
+   empty regardless of whether sibling branches were paused.
+
+The langgraph semantics this test relies on were verified empirically in the
+exploratory probe before this test was authored.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Annotated
+
+import pytest
+from langchain.tools import ToolRuntime
+from langchain_core.messages import AIMessage, HumanMessage
+from langchain_core.runnables import RunnableConfig
+from langgraph.checkpoint.memory import InMemorySaver
+from langgraph.graph import END, START, StateGraph
+from langgraph.graph.message import add_messages
+from langgraph.types import Command, Send, interrupt
+from typing_extensions import TypedDict
+
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.resume_routing import (
+    build_lg_resume_map,
+    collect_pending_tool_calls,
+    slice_decisions_by_tool_call,
+)
+from app.agents.multi_agent_chat.middleware.main_agent.checkpointed_subagent_middleware.task_tool import (
+    build_task_tool_with_parent_config,
+)
+
+
+class _SubState(TypedDict, total=False):
+    messages: Annotated[list, add_messages]
+
+
+class _DispatchState(TypedDict, total=False):
+    messages: Annotated[list, add_messages]
+    tcid: str
+    desc: str
+    subtype: str
+
+
+_QUICK_MARKER = "quick-subagent-finished-without-pausing"
+
+
+def _build_quick_subagent(checkpointer: InMemorySaver):
+    """Subagent that completes synchronously without firing any interrupt."""
+
+    def quick_node(_state):
+        return {"messages": [AIMessage(content=_QUICK_MARKER)]}
+
+    g = StateGraph(_SubState)
+    g.add_node("quick", quick_node)
+    g.add_edge(START, "quick")
+    g.add_edge("quick", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _build_pausing_subagent(checkpointer: InMemorySaver):
+    """Subagent that pauses with a single-action HITL bundle and records its resume payload."""
+
+    def hitl_node(_state):
+        decision = interrupt(
+            {
+                "action_requests": [
+                    {"name": "act_0", "args": {"i": 0}, "description": ""}
+                ],
+                "review_configs": [
+                    {
+                        "action_name": "act_0",
+                        "allowed_decisions": ["approve", "reject", "edit"],
+                    }
+                ],
+            }
+        )
+        return {"messages": [AIMessage(content=json.dumps(decision, sort_keys=True))]}
+
+    g = StateGraph(_SubState)
+    g.add_node("hitl", hitl_node)
+    g.add_edge(START, "hitl")
+    g.add_edge("hitl", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _parent_with_two_branches(task_tool, *, dispatches, checkpointer):
+    def fanout(_state) -> list[Send]:
+        return [Send("call_task", d) for d in dispatches]
+
+    async def call_task(state: _DispatchState, config: RunnableConfig):
+        rt = ToolRuntime(
+            state=state,
+            config=config,
+            context=None,
+            stream_writer=None,
+            tool_call_id=state["tcid"],
+            store=None,
+        )
+        return await task_tool.coroutine(
+            description=state["desc"], subagent_type=state["subtype"], runtime=rt
+        )
+
+    g = StateGraph(_DispatchState)
+    g.add_node("call_task", call_task)
+    g.add_conditional_edges(START, fanout, ["call_task"])
+    g.add_edge("call_task", END)
+    return g.compile(checkpointer=checkpointer)
+
+
+def _quick_marker_count(state) -> int:
+    """How many messages anywhere in parent state contain the quick subagent's marker."""
+    n = 0
+    for msg in state.values.get("messages", []) or []:
+        content = getattr(msg, "content", "")
+        if isinstance(content, str) and _QUICK_MARKER in content:
+            n += 1
+    return n
+
+
+@pytest.mark.asyncio
+async def test_partial_pause_routes_only_to_paused_branch_without_rerunning_completed_one():
+    """One branch completes synchronously; the other pauses with HITL — resume routes only to B.
+
+    Setup:
+    - Sub-A (``quick``): no interrupt, finishes immediately, writes a marker
+      message to parent state.
+    - Sub-B (``pausing``): interrupts with a 1-action HITL bundle.
+
+    At pause, parent state has A's marker already merged in and exactly one
+    pending interrupt (B's). Resume sends a 1-element flat decisions list;
+    the routing helpers must not look up A in the resume map (would explode
+    with an unknown ``Interrupt.id``) and must not re-invoke A on resume
+    (would duplicate the marker).
+    """
+    checkpointer = InMemorySaver()
+
+    quick_sub = _build_quick_subagent(checkpointer)
+    pausing_sub = _build_pausing_subagent(checkpointer)
+
+    task_tool = build_task_tool_with_parent_config(
+        [
+            {"name": "quick-agent", "description": "instant", "runnable": quick_sub},
+            {
+                "name": "pausing-agent",
+                "description": "needs review",
+                "runnable": pausing_sub,
+            },
+        ]
+    )
+
+    parent = _parent_with_two_branches(
+        task_tool,
+        dispatches=[
+            {"tcid": "tcid-A", "subtype": "quick-agent", "desc": "do A fast"},
+            {
+                "tcid": "tcid-B",
+                "subtype": "pausing-agent",
+                "desc": "needs approval",
+            },
+        ],
+        checkpointer=checkpointer,
+    )
+
+    config: dict = {
+        "configurable": {"thread_id": "partial-pause-thread"},
+        "recursion_limit": 100,
+    }
+    await parent.ainvoke({"messages": [HumanMessage(content="seed")]}, config)
+
+    paused = await parent.aget_state(config)
+
+    assert len(paused.interrupts) == 1, (
+        f"REGRESSION: expected exactly 1 pending interrupt (sub-B alone), "
+        f"got {len(paused.interrupts)}"
+    )
+
+    pending = collect_pending_tool_calls(paused)
+    assert pending == [("tcid-B", 1)], (
+        f"REGRESSION: pending list contains stale tcids; got {pending!r}"
+    )
+
+    pre_resume_marker_count = _quick_marker_count(paused)
+    assert pre_resume_marker_count == 1, (
+        f"REGRESSION: sub-A's contribution missing or duplicated at pause "
+        f"(found {pre_resume_marker_count}, expected 1)"
+    )
+
+    flat_decisions = [{"type": "approve"}]
+    by_tool_call_id = slice_decisions_by_tool_call(flat_decisions, pending)
+    assert by_tool_call_id == {"tcid-B": {"decisions": [{"type": "approve"}]}}, (
+        f"REGRESSION: slicer routed to a non-pending tcid: {by_tool_call_id!r}"
+    )
+
+    config["configurable"]["surfsense_resume_value"] = by_tool_call_id
+    lg_resume_map = build_lg_resume_map(paused, by_tool_call_id)
+
+    assert set(lg_resume_map.keys()) == {paused.interrupts[0].id}, (
+        f"REGRESSION: resume map keyed by an unknown Interrupt.id "
+        f"(would crash langgraph): {lg_resume_map!r}"
+    )
+
+    await parent.ainvoke(Command(resume=lg_resume_map), config)
+
+    final = await parent.aget_state(config)
+    assert not final.interrupts, (
+        f"REGRESSION: pending interrupts after resume: {final.interrupts!r}"
+    )
+
+    post_resume_marker_count = _quick_marker_count(final)
+    assert post_resume_marker_count == 1, (
+        f"REGRESSION: sub-A re-ran on resume (marker count went "
+        f"{pre_resume_marker_count} → {post_resume_marker_count}); "
+        f"resume must touch only the paused branch."
+    )
+
+    payloads: list[dict] = []
+    for msg in final.values.get("messages", []) or []:
+        content = getattr(msg, "content", None)
+        if isinstance(content, str):
+            try:
+                payloads.append(json.loads(content))
+            except json.JSONDecodeError:
+                pass
+
+    assert {"decisions": [{"type": "approve"}]} in payloads, (
+        f"REGRESSION: sub-B did not receive its single approve on resume; "
+        f"payloads seen: {payloads!r}"
+    )