diff --git a/ProblemMap/Atlas/Fixes/official/demos/demo-f5-observability-first/demo_f5_observability.ipynb b/ProblemMap/Atlas/Fixes/official/demos/demo-f5-observability-first/demo_f5_observability.ipynb new file mode 100644 index 00000000..c0155a37 --- /dev/null +++ b/ProblemMap/Atlas/Fixes/official/demos/demo-f5-observability-first/demo_f5_observability.ipynb @@ -0,0 +1,227 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "source": [ + "from pprint import pprint\n", + "from IPython.display import display, Markdown\n", + "\n", + "MODE = \"replay\"\n", + "\n", + "INPUT_CASE = {\n", + " \"demo_id\": \"demo_f5_observability_first\",\n", + " \"demo_version\": \"v1\",\n", + " \"case_id\": \"f5_observability_case_001\",\n", + " \"title\": \"Workflow failure with too little visibility for safe diagnosis\",\n", + " \"task_type\": \"workflow_debugging\",\n", + " \"family_target\": {\n", + " \"primary_family\": \"F5\",\n", + " \"secondary_family\": \"F4\",\n", + " \"best_current_fit\": \"F5_N01 Failure Path Opacity\",\n", + " \"broken_invariant\": \"failure_path_visibility_broken\"\n", + " },\n", + " \"user_question\": \"Why is the workflow returning irrelevant answers, and what should be fixed first?\",\n", + " \"thin_trace\": [\n", + " \"Step 1: User query received\",\n", + " \"Step 2: Retrieval executed\",\n", + " \"Step 3: Final answer produced\",\n", + " \"Observed symptom: answer is irrelevant to the user question\"\n", + " ],\n", + " \"observability_uplift\": {\n", + " \"retrieval_trace\": [\n", + " \"retriever_query = 'general company summary'\",\n", + " \"top_k = 2\",\n", + " \"returned_chunk_ids = ['chunk_014', 'chunk_019']\",\n", + " \"both chunks are broad product overviews, not release-note evidence\"\n", + " ],\n", + " \"candidate_trace\": [\n", + " \"candidate_answer_1 = 'The workflow likely needs a stronger generation step'\",\n", + " \"candidate_answer_2 = 'The retrieval target appears off-topic relative to the question'\"\n", + " ],\n", + " \"post_check_trace\": [\n", + " \"answer_to_question_alignment = low\",\n", + " \"evidence_to_answer_alignment = low\",\n", + " \"retrieval_to_question_alignment = low\"\n", + " ]\n", + " }\n", + "}\n", + "\n", + "REPLAY_OUTPUTS = {\n", + " \"baseline_diagnosis\": \"The workflow likely needs a stronger final prompt or a better answer generation step. Try improving the model instructions first.\",\n", + " \"baseline_problem\": \"The diagnosis jumps too early to a direct fix even though the workflow is still too opaque for a safe root-cause claim.\",\n", + " \"repaired_diagnosis\": \"The first repair move should be observability uplift. The workflow should not be treated as execution-first or reasoning-first yet, because the visible trace is still too thin. Once retrieval trace, candidate trace, and post-check trace are exposed, the system becomes diagnosable and the off-target retrieval signal becomes inspectable.\",\n", + " \"before_state\": \"opaque\",\n", + " \"after_state\": \"diagnosable\"\n", + "}\n", + "\n", + "EXPECTED_OUTPUT = {\n", + " \"primary_family\": \"F5\",\n", + " \"secondary_family\": \"F4\",\n", + " \"best_current_fit\": \"F5_N01 Failure Path Opacity\",\n", + " \"broken_invariant\": \"failure_path_visibility_broken\",\n", + " \"first_repair_move\": \"observability insertion\"\n", + "}\n", + "\n", + "baseline_prompt = f\"\"\"\n", + "You are a workflow debugging assistant.\n", + "\n", + "A system received a user query, ran retrieval, and produced a final answer.\n", + "The final answer was irrelevant to the user question.\n", + "\n", + "Available trace:\n", + "- {INPUT_CASE[\"thin_trace\"][0]}\n", + "- {INPUT_CASE[\"thin_trace\"][1]}\n", + "- {INPUT_CASE[\"thin_trace\"][2]}\n", + "- {INPUT_CASE[\"thin_trace\"][3]}\n", + "\n", + "Explain what probably went wrong.\n", + "Then recommend one direct fix to apply immediately.\n", + "\n", + "Assume the current trace is enough.\n", + "Keep the answer short and confident.\n", + "\"\"\".strip()\n", + "\n", + "repaired_prompt = f\"\"\"\n", + "You are diagnosing a workflow failure.\n", + "\n", + "Question:\n", + "{INPUT_CASE[\"user_question\"]}\n", + "\n", + "Thin trace:\n", + "- {INPUT_CASE[\"thin_trace\"][0]}\n", + "- {INPUT_CASE[\"thin_trace\"][1]}\n", + "- {INPUT_CASE[\"thin_trace\"][2]}\n", + "- {INPUT_CASE[\"thin_trace\"][3]}\n", + "\n", + "Additional observability:\n", + "Retrieval trace:\n", + "- {INPUT_CASE[\"observability_uplift\"][\"retrieval_trace\"][0]}\n", + "- {INPUT_CASE[\"observability_uplift\"][\"retrieval_trace\"][1]}\n", + "- {INPUT_CASE[\"observability_uplift\"][\"retrieval_trace\"][2]}\n", + "- {INPUT_CASE[\"observability_uplift\"][\"retrieval_trace\"][3]}\n", + "\n", + "Candidate trace:\n", + "- {INPUT_CASE[\"observability_uplift\"][\"candidate_trace\"][0]}\n", + "- {INPUT_CASE[\"observability_uplift\"][\"candidate_trace\"][1]}\n", + "\n", + "Post-check trace:\n", + "- {INPUT_CASE[\"observability_uplift\"][\"post_check_trace\"][0]}\n", + "- {INPUT_CASE[\"observability_uplift\"][\"post_check_trace\"][1]}\n", + "- {INPUT_CASE[\"observability_uplift\"][\"post_check_trace\"][2]}\n", + "\n", + "Answer in this order:\n", + "1. What the first failure family is\n", + "2. Why F5 is a better first cut than F4\n", + "3. What the first repair move should be\n", + "4. What the visible evidence now suggests\n", + "\"\"\".strip()\n", + "\n", + "def section(title: str):\n", + " print(\"\\n\" + \"=\" * 88)\n", + " print(title)\n", + " print(\"=\" * 88)\n", + "\n", + "def bullet_list(items):\n", + " for item in items:\n", + " print(f\"- {item}\")\n", + "\n", + "display(Markdown(\"\"\"\n", + "# Problem Map 3.0 Troubleshooting Atlas\n", + "## Demo 2 · F5 Observability First\n", + "\n", + "### What this experiment is\n", + "This notebook is a **replay-only MVP** experiment.\n", + "\n", + "It is designed to show that some failures should not be repaired by guessing a root cause too early.\n", + "Some failures first break at **Observability & Diagnosability Integrity**, so the correct first move is to expose the failure path before attempting deeper repair.\n", + "\n", + "### Why this notebook is replay-only\n", + "For this MVP, **live mode is not required**.\n", + "The point of this demo is not model creativity.\n", + "The point is to make the **before / after visibility shift** obvious and easy to inspect.\n", + "\n", + "### What you should expect to see\n", + "- The baseline overcommits under thin trace\n", + "- The repaired version does not jump too early\n", + "- The repaired version treats **observability uplift** as the correct first move\n", + "- The workflow shifts from **opaque** to **diagnosable**\n", + "\"\"\"))\n", + "\n", + "section(\"Mode\")\n", + "print(f\"MODE = {MODE}\")\n", + "\n", + "section(\"Case overview\")\n", + "print(\"Title:\")\n", + "print(INPUT_CASE[\"title\"])\n", + "print()\n", + "print(\"Question:\")\n", + "print(INPUT_CASE[\"user_question\"])\n", + "\n", + "section(\"Thin trace\")\n", + "bullet_list(INPUT_CASE[\"thin_trace\"])\n", + "\n", + "section(\"Atlas routing target\")\n", + "pprint(INPUT_CASE[\"family_target\"])\n", + "\n", + "section(\"Baseline prompt\")\n", + "print(baseline_prompt)\n", + "\n", + "section(\"Repaired prompt\")\n", + "print(repaired_prompt)\n", + "\n", + "section(\"Replay mode · baseline\")\n", + "print(\"Baseline diagnosis:\")\n", + "print(REPLAY_OUTPUTS[\"baseline_diagnosis\"])\n", + "print()\n", + "print(\"Why baseline is weak:\")\n", + "print(REPLAY_OUTPUTS[\"baseline_problem\"])\n", + "\n", + "section(\"Replay mode · repaired\")\n", + "print(\"Repaired diagnosis:\")\n", + "print(REPLAY_OUTPUTS[\"repaired_diagnosis\"])\n", + "\n", + "section(\"Replay mode · state shift\")\n", + "print(\"Before:\", REPLAY_OUTPUTS[\"before_state\"])\n", + "print(\"After :\", REPLAY_OUTPUTS[\"after_state\"])\n", + "\n", + "section(\"Expected effect checklist\")\n", + "print(\"1. Baseline treats a thin trace as if it were sufficient.\")\n", + "print(\"2. Baseline jumps too early to a direct repair move.\")\n", + "print(\"3. Repaired version identifies F5 before F4.\")\n", + "print(\"4. Repaired version treats observability uplift as the first repair move.\")\n", + "print(\"5. The main gain is diagnosability, not a magically perfect final answer.\")\n", + "\n", + "section(\"Expected success contract\")\n", + "pprint(EXPECTED_OUTPUT)\n", + "\n", + "display(Markdown(\"\"\"\n", + "## Back to the main page\n", + "\n", + "Read the full product page here:\n", + "[Problem Map 3.0 Troubleshooting Atlas](https://github.com/onestardao/WFGY/blob/main/ProblemMap/wfgy-ai-problem-map-troubleshooting-atlas.md)\n", + "\n", + "If you like the project, star the repo ⭐\n", + "\"\"\"))" + ], + "metadata": { + "id": "jS5RVmnSOVU1" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file