{ "version": "wfgy_problem_catalog_v1", "description": "Machine-readable problem catalog for the WFGY RAG 16 Problem Map. Wave 0 focuses on modes 1, 2, 5, and 8.", "last_updated_utc": "2026-03-02T00:00:00Z", "lanes": { "IN": { "name": "input_retrieval", "description": "Retrieval, input preparation, and evidence selection." }, "RE": { "name": "reasoning_planning", "description": "Reasoning, interpretation of evidence, and planning." }, "ST": { "name": "state_context", "description": "State, memory, and long-context behavior." }, "OP": { "name": "infra_deploy", "description": "Infrastructure, deployment, and environment setup." } }, "types": { "R": { "name": "retrieval", "description": "The primary issue is in retrieval or input selection." }, "P": { "name": "prompt_reasoning", "description": "The primary issue is in prompting, reasoning, or interpretation." }, "S": { "name": "state_memory", "description": "The primary issue is in state, memory, or multi-run behavior." }, "I": { "name": "infra", "description": "The primary issue is in infra, deployment, or observability." } }, "wave0_scope": { "description": "Wave 0 focuses on modes that are both observable and suitable for constrained automated repair loops.", "modes": [1, 2, 5, 8] }, "modes": [ { "id": 1, "code": "IN-1", "short_name": "retrieval_hallucination_drift", "lane": "IN", "default_type": "R", "labels": { "title": "Hallucination / drift with wrong or off-topic evidence", "symptom": "Retrieved evidence E is wrong, off-topic, or misaligned with Q. The answer A confidently follows the bad evidence." }, "signals": { "primary": [ "Q and E describe different topics, entities, or time ranges.", "A is reasonably consistent with E but inconsistent with the true intent of Q.", "Changing retrieval parameters or index often changes the failure pattern significantly." ], "secondary": [ "ΔS(Q, E) is in a high-risk or danger zone when embeddings are available.", "Small changes to Q cause large jumps in which documents are retrieved.", "The system returns outdated or wrong-tenant documents even when the correct documents exist in the index." ] }, "failure_pattern": { "type_hint": "R", "description": "The model is not purely inventing answers. It is aligning to the evidence it sees, but the evidence itself is wrong or irrelevant." }, "fix_focus": [ "Review retrieval configuration: index, filters, top-k, and scoring.", "Check coverage: ensure that relevant documents actually exist and are indexed.", "Add or refine routing / query rewrite rules for high-risk queries.", "Introduce sanity checks when E and Q clearly refer to different topics or time windows." ], "verification_hints": [ "Re-run the same Q after retrieval changes and confirm that E now contains the correct or much closer documents.", "Check that A no longer cites off-topic or outdated documents.", "Track how often ΔS(Q, E) moves from risk/danger into safe/transit zones after the fix, if embeddings are available." ], "automation": { "wave0_supported": true, "automation_notes": [ "Good candidate for automated top-k sweeps, query rewrite experiments, and retrieval configuration search.", "Can be paired with a simple verification loop that checks whether the retrieved documents are on-topic before generating A." ] } }, { "id": 2, "code": "RE-2", "short_name": "misread_evidence", "lane": "RE", "default_type": "P", "labels": { "title": "Misread or misinterpreted evidence", "symptom": "Evidence E is relevant and mostly correct, but the answer A misreads, overstates, or ignores key parts of E." 
}, "signals": { "primary": [ "E contains the correct information, but A contradicts or distorts it.", "A selectively quotes E but ignores conditions, caveats, or time ranges.", "A blends multiple pieces of evidence into an answer that does not match any single source." ], "secondary": [ "When asked to explicitly quote sources or cite specific sentences, the model improves significantly.", "Rephrasing Q as a verification or comparison question reduces the error rate.", "The same E, when inspected by a human, is enough to answer correctly." ] }, "failure_pattern": { "type_hint": "P", "description": "The retrieval step did its job. The failure arises from how the model reads, weighs, or uses the evidence." }, "fix_focus": [ "Switch to evidence-first answer patterns: quote or summarize E before making claims.", "Enforce citation-before-claim or similar constraints in the prompt template.", "Ask the model to enumerate candidate answers with supporting spans from E before choosing a final answer.", "Reduce room for speculation by tightening instructions around what counts as an acceptable answer." ], "verification_hints": [ "Check that A now includes explicit links to spans or sentences in E.", "Compare old vs new answers and confirm that contradictions with E are reduced.", "Use spot checks where humans verify that A no longer adds unsupported claims beyond E." ], "automation": { "wave0_supported": true, "automation_notes": [ "Good candidate for constrained prompt repair and template-level changes.", "Pairs well with automatic checks that require each major claim to reference a specific part of E." ] } }, { "id": 5, "code": "IN-5", "short_name": "embedding_false_positive", "lane": "IN", "default_type": "R", "labels": { "title": "Embedding false positive", "symptom": "Similarity scores between Q and E are high, but E is semantically off or irrelevant when inspected by humans." }, "signals": { "primary": [ "E has high embedding scores relative to Q but looks wrong or off-topic on manual inspection.", "Retrieval often returns near-duplicate or overly generic documents instead of specific, relevant ones.", "Changing the embedding model or normalization changes which documents appear, even with the same index and Q." ], "secondary": [ "Similar failure patterns show up across many queries using the same embedding model.", "Reranking with a different model improves relevance without changing the base retriever.", "Chunking strategies and document boundaries strongly influence which false positives appear." ] }, "failure_pattern": { "type_hint": "R", "description": "The similarity geometry is misleading. The distance metric or embedding representation is pulling in the wrong neighbors." }, "fix_focus": [ "Review embedding model choice and consider a more domain-aligned model.", "Apply normalization or scaling strategies that better separate relevant vs irrelevant candidates.", "Tune chunking granularity to avoid mixing multiple topics into a single chunk.", "Introduce a reranking step that can reject semantically off-target results even when similarity is high." ], "verification_hints": [ "Measure how often clearly off-topic chunks appear in the top-k set before and after changes.", "Track relevance labels (even if weak) to verify fewer embedding-driven false positives.", "Check that domain-specific queries now retrieve documents that match terminology and intent more precisely." 
], "automation": { "wave0_supported": true, "automation_notes": [ "Suitable for automated experiments that compare different embedding models, normalization schemes, and rerankers.", "Can be combined with a small labeled or heuristic relevance set to automatically score before/after changes." ] } }, { "id": 8, "code": "IN-8", "short_name": "no_evidence_visibility", "lane": "IN", "default_type": "I", "labels": { "title": "No evidence visibility", "symptom": "The system cannot reliably inspect or log E. Evidence is missing, truncated, or invisible to the debugging process." }, "signals": { "primary": [ "Logs or traces do not include the retrieved evidence, only the final answer A.", "Different runs appear to produce different answers without a clear view of which E was used.", "Production systems use retrieval, but no stable mechanism exists to see what was actually retrieved." ], "secondary": [ "Replaying the same Q in a controlled environment yields different or non-reproducible behavior.", "Monitoring shows calls to a retriever service, but payloads are not recorded.", "Engineers cannot easily attach E when filing a bug report for a bad answer." ] }, "failure_pattern": { "type_hint": "I", "description": "The main obstacle is observability. Retrieval may be fine or broken, but there is no reliable way to see or export E." }, "fix_focus": [ "Add structured logging or tracing for E with appropriate privacy and security controls.", "Standardize a minimal case packet that always includes Q, E, P, and A for debugging.", "Ensure that tracing covers all relevant environments: dev, staging, and production.", "Introduce feature flags that allow temporary deep logging for high-value incidents." ], "verification_hints": [ "Confirm that new logs contain Q, E, P, and A in a reproducible format.", "Check that a failing production case can be exported into a single case JSON and replayed.", "Ensure that teams can now share minimal failure packets without screenshots or ad-hoc copy/paste." ], "automation": { "wave0_supported": true, "automation_notes": [ "Good candidate for automated checks that validate whether Q, E, P, and A are present in tracing payloads.", "Pairs well with CI or canary tests that ensure critical routes always emit a minimal debug packet when enabled." ] } } ] }