From f27616ead1ce0e4e98abdc50cd20e9726f05358f Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 6 Feb 2026 00:39:39 +0000
Subject: [PATCH] feat: Add quantum simulation engine ADR series (QE-001 to
 QE-012) and DDD design documents

Comprehensive architecture decision records and domain-driven design documentation
for integrating a Rust-based quantum simulation engine (ruQu) into the ruVector stack.

ADR Series (12 documents):
- QE-001: Core Architecture - pure Rust state-vector simulator decision
- QE-002: Crate Structure - three-crate architecture (ruqu-core, ruqu-wasm, ruqu-algorithms)
- QE-003: WASM Compilation - WebAssembly strategy with 25-qubit limit enforcement
- QE-004: Performance Optimization - SIMD, multithreading, gate fusion, benchmarks
- QE-005: VQE Algorithm - variational eigensolver with exact expectation values
- QE-006: Grover Search - O(1) oracle optimization via direct state vector access
- QE-007: QAOA MaxCut - graph-based optimization with Rzz native gates
- QE-008: Surface Code Error Correction - mid-circuit measurement, syndrome extraction
- QE-009: Tensor Network Evaluation - MPS/contraction for shallow circuits
- QE-010: Observability & Monitoring - metrics, tracing, health checks integration
- QE-011: Memory Gating & Power Management - zero-idle, on-demand allocation
- QE-012: Min-Cut Coherence Integration - syndrome-to-decoder bridge with ruQu

DDD Design (3 documents):
- Strategic Design: 6 bounded contexts, context map, ubiquitous language
- Tactical Design: 6 aggregates, 20+ value objects, 15+ domain events, services
- Integration Patterns: anti-corruption layers, shared kernel, event flows

https://claude.ai/code/session_01B1NkbLDWYPaacS9miKsnvW
---
 ...QE-001-quantum-engine-core-architecture.md |  305 ++++
 .../ADR-QE-002-crate-structure-integration.md |  474 ++++++
 .../ADR-QE-003-wasm-compilation-strategy.md   |  459 ++++++
 ...004-performance-optimization-benchmarks.md |  564 +++++++
 .../ADR-QE-005-vqe-algorithm-support.md       |  650 ++++++++
 ...ADR-QE-006-grover-search-implementation.md |  562 +++++++
 .../ADR-QE-007-qaoa-maxcut-implementation.md  |  631 ++++++++
 ...DR-QE-008-surface-code-error-correction.md |  997 ++++++++++++
 .../ADR-QE-009-tensor-network-evaluation.md   |  480 ++++++
 .../ADR-QE-010-observability-monitoring.md    |  689 ++++++++
 ...R-QE-011-memory-gating-power-management.md |  628 ++++++++
 ...ADR-QE-012-mincut-coherence-integration.md |  876 ++++++++++
 .../quantum-engine-ddd-integration.md         |  816 ++++++++++
 .../quantum-engine-ddd-strategic.md           |  530 ++++++
 .../quantum-engine-ddd-tactical.md            | 1426 +++++++++++++++++
 15 files changed, 10087 insertions(+)
 create mode 100644 docs/adr/quantum-engine/ADR-QE-001-quantum-engine-core-architecture.md
 create mode 100644 docs/adr/quantum-engine/ADR-QE-002-crate-structure-integration.md
 create mode 100644 docs/adr/quantum-engine/ADR-QE-003-wasm-compilation-strategy.md
 create mode 100644 docs/adr/quantum-engine/ADR-QE-004-performance-optimization-benchmarks.md
 create mode 100644 docs/adr/quantum-engine/ADR-QE-005-vqe-algorithm-support.md
 create mode 100644 docs/adr/quantum-engine/ADR-QE-006-grover-search-implementation.md
 create mode 100644 docs/adr/quantum-engine/ADR-QE-007-qaoa-maxcut-implementation.md
 create mode 100644 docs/adr/quantum-engine/ADR-QE-008-surface-code-error-correction.md
 create mode 100644 docs/adr/quantum-engine/ADR-QE-009-tensor-network-evaluation.md
 create mode 100644 docs/adr/quantum-engine/ADR-QE-010-observability-monitoring.md
 create mode 100644 docs/adr/quantum-engine/ADR-QE-011-memory-gating-power-management.md
 create mode 100644 docs/adr/quantum-engine/ADR-QE-012-mincut-coherence-integration.md
 create mode 100644 docs/architecture/quantum-engine/quantum-engine-ddd-integration.md
 create mode 100644 docs/architecture/quantum-engine/quantum-engine-ddd-strategic.md
 create mode 100644 docs/architecture/quantum-engine/quantum-engine-ddd-tactical.md

diff --git a/docs/adr/quantum-engine/ADR-QE-001-quantum-engine-core-architecture.md b/docs/adr/quantum-engine/ADR-QE-001-quantum-engine-core-architecture.md
new file mode 100644
index 00000000..d9eece34
--- /dev/null
+++ b/docs/adr/quantum-engine/ADR-QE-001-quantum-engine-core-architecture.md
@@ -0,0 +1,305 @@
+# ADR-QE-001: Quantum Engine Core Architecture
+
+**Status**: Proposed
+**Date**: 2026-02-06
+**Authors**: ruv.io, RuVector Team
+**Deciders**: Architecture Review Board
+
+## Context
+
+### Problem Statement
+
+ruVector needs a quantum simulation engine for on-device quantum algorithm
+experimentation. The platform runs on distributed edge systems, primarily
+targeting Cognitum's 256-core low-power processors, and emphasizes ultra-low-power
+event-driven computing. Quantum simulation is a natural extension of ruVector's
+mathematical computation capabilities: the same SIMD-optimized linear algebra
+that powers vector search and neural inference can drive state-vector manipulation
+for quantum circuits.
+
+### Requirements
+
+The engine must support gate-model quantum circuit simulation up to approximately
+25 qubits, covering the following algorithm families:
+
+| Algorithm Family | Use Case | Typical Qubits | Gate Depth |
+|------------------|----------|-----------------|------------|
+| VQE (Variational Quantum Eigensolver) | Molecular simulation, optimization | 8-20 | 50-500 per iteration |
+| Grover's Search | Unstructured database search | 8-25 | O(sqrt(2^n)) |
+| QAOA (Quantum Approximate Optimization) | Combinatorial optimization | 10-25 | O(p * edges) |
+| Quantum Error Correction | Surface code, stabilizer circuits | 9-25 (logical + ancilla) | Repetitive syndrome rounds |
+
+### Memory Scaling Analysis
+
+Quantum state-vector simulation stores the full amplitude vector of 2^n complex
+numbers. Each amplitude is a pair of f64 values (real + imaginary = 16 bytes).
+Memory grows exponentially:
+
+```
+Qubits  Amplitudes       State Size     With Scratch Buffer
+------  -----------      ----------     -------------------
+10      1,024            16 KB          32 KB
+15      32,768           512 KB         1 MB
+20      1,048,576        16 MB          32 MB
+22      4,194,304        64 MB          128 MB
+24      16,777,216       256 MB         512 MB
+25      33,554,432       512 MB         1.07 GB
+26      67,108,864       1.07 GB        2.14 GB
+28      268,435,456      4.29 GB        8.59 GB
+30      1,073,741,824    17.18 GB       34.36 GB
+```
+
+At 25 qubits the state vector requires approximately 512 MB (1.07 GB with a
+scratch buffer for intermediate calculations). This is the practical ceiling
+for WebAssembly's 32-bit address space. Native execution with sufficient RAM
+can push to 30+ qubits.
+
+### Edge Computing Constraints
+
+Cognitum's 256-core processors operate under strict power and memory budgets:
+
+- **Power envelope**: Event-driven activation; cores idle at near-zero draw
+- **Memory**: Shared pool, typically 2-8 GB per node
+- **Interconnect**: Low-latency mesh between cores, suitable for parallel simulation
+- **Workload model**: Burst computation triggered by agent events, not continuous
+
+The quantum engine must respect this model: allocate state only when a simulation
+is triggered, execute the circuit, return results, and immediately release all
+memory.
+
+## Decision
+
+Implement a **pure Rust state-vector quantum simulator** as a new crate family
+(`ruQu` quantum engine) within the ruVector workspace. The following architectural
+decisions define the engine.
+
+### 1. Pure Rust Implementation (No C/C++ FFI)
+
+The entire simulation engine is written in Rust with no foreign function interface
+dependencies. This ensures:
+
+- Compilation to `wasm32-unknown-unknown` without emscripten or C toolchains
+- Memory safety guarantees throughout the simulation pipeline
+- Unified build system via Cargo across all targets
+- No external library version conflicts or platform-specific linking issues
+
+### 2. State-Vector Simulation as Primary Backend
+
+The engine uses explicit full-amplitude state-vector representation as its
+primary simulation mode. Each gate application transforms the full 2^n
+amplitude vector via matrix-vector multiplication.
+
+```
+Circuit Execution Model:
+
+  |psi_0> ──[H]──[CNOT]──[Rz(theta)]──[Measure]── classical bits
+     |          |            |              |
+     v          v            v              v
+  [init]    [apply_H]   [apply_CNOT]   [apply_Rz]   [sample]
+     |          |            |              |           |
+  2^n f64   2^n f64      2^n f64        2^n f64     collapse
+  complex   complex      complex        complex     to basis
+```
+
+Gate application follows the standard decomposition:
+
+- **Single-qubit gates**: Iterate amplitude pairs (i, i XOR 2^target), apply 2x2
+  unitary. O(2^n) operations per gate.
+- **Two-qubit gates**: Iterate amplitude quadruples, apply 4x4 unitary.
+  O(2^n) operations per gate.
+- **Multi-qubit gates**: Decompose into single and two-qubit gates, or apply
+  directly via 2^k x 2^k matrix on k target qubits.
+
+### 3. Qubit Limits and Precision
+
+| Parameter | WASM Target | Native Target |
+|-----------|-------------|---------------|
+| Max qubits (default) | 25 | 30+ (RAM-dependent) |
+| Max qubits (hard limit) | 26 (with f32) | Memory-limited |
+| Precision (default) | Complex f64 | Complex f64 |
+| Precision (optional) | Complex f32 | Complex f32 |
+| State size at max | ~1.07 GB | ~17 GB at 30 qubits |
+
+Complex f64 is the default precision, providing approximately 15 decimal digits
+of accuracy -- sufficient for quantum chemistry applications and deep circuits
+where accumulated floating-point error matters. An optional f32 mode halves
+memory usage at the cost of precision, suitable for shallow circuits and
+approximate optimization.
+
+### 4. Event-Driven Activation Model
+
+The engine follows ruVector's event-driven philosophy:
+
+```
+Agent Context          ruQu Engine              Memory
+     |                      |                      |
+     |-- trigger(circuit) ->|                      |
+     |                      |-- allocate(2^n) ---->|
+     |                      |<---- state_ptr ------|
+     |                      |                      |
+     |                      |-- [execute gates] -->|
+     |                      |-- [measure] -------->|
+     |                      |                      |
+     |<-- results ---------|                      |
+     |                      |-- deallocate() ----->|
+     |                      |                      |
+   (idle)                (inert)               (freed)
+```
+
+- **Inert by default**: No background threads, no persistent allocations
+- **Allocate on demand**: State vector created when circuit execution begins
+- **Free immediately**: All simulation memory released upon result delivery
+- **No global state**: Multiple concurrent simulations supported via independent
+  state handles (no shared mutable global)
+
+### 5. Dual-Target Compilation
+
+The crate supports two compilation targets from a single codebase:
+
+```
+                    ruqu-core
+                       |
+            +----------+----------+
+            |                     |
+    [native target]       [wasm32-unknown-unknown]
+            |                     |
+    - Full SIMD (AVX2,      - WASM SIMD128
+      AVX-512, NEON)        - 4GB address limit
+    - Rayon threading        - Optional SharedArrayBuffer
+    - Optional GPU (wgpu)    - No GPU
+    - 30+ qubits             - 25 qubit ceiling
+    - Full OS integration    - Sandboxed
+```
+
+Conditional compilation via Cargo feature flags controls target-specific code
+paths. The public API surface is identical across targets.
+
+### 6. Optional Tensor Network Mode
+
+For circuits with limited entanglement (e.g., shallow QAOA, certain VQE
+ansatze), the engine offers an optional tensor network backend:
+
+- Represents the quantum state as a network of tensors rather than a single
+  exponential vector
+- Memory scales as O(n * chi^2) where chi is the bond dimension (maximum
+  entanglement width)
+- Efficient for circuits where entanglement grows slowly or remains bounded
+- Falls back to full state-vector when bond dimension exceeds threshold
+- Enabled via the `tensor-network` feature flag
+
+## Alternatives Considered
+
+### Alternative 1: Qukit (Rust, WASM-ready)
+
+A pre-1.0 Rust quantum simulator with WASM support.
+
+| Criterion | Assessment |
+|-----------|------------|
+| Maturity | Pre-1.0, limited community |
+| WASM support | Present but untested at scale |
+| Optimization | Basic; no SIMD, no gate fusion |
+| Integration | Would require adapter layer |
+| Maintenance | External dependency risk |
+
+**Rejected**: Insufficient optimization depth and maturity for production use.
+
+### Alternative 2: QuantRS2 (Rust, Python-focused)
+
+A Rust quantum simulator primarily targeting Python bindings via PyO3.
+
+| Criterion | Assessment |
+|-----------|------------|
+| Performance | Good benchmarks on native |
+| WASM support | Not a design target |
+| Dependencies | Heavy; Python-oriented build |
+| API design | Python-first, Rust API secondary |
+| Integration | Significant impedance mismatch |
+
+**Rejected**: Python-centric design creates unnecessary weight and integration
+friction for a Rust-native edge system.
+
+### Alternative 3: roqoqo + QuEST (Rust frontend, C backend)
+
+roqoqo provides a Rust circuit description layer; QuEST is a high-performance
+C/C++ state-vector simulator.
+
+| Criterion | Assessment |
+|-----------|------------|
+| Performance | Excellent (QuEST is highly optimized) |
+| WASM support | QuEST's C code breaks WASM compilation |
+| Maintenance | External C library maintenance burden |
+| Memory safety | C backend outside Rust safety guarantees |
+
+**Rejected**: C dependency is incompatible with WASM target requirement.
+
+### Alternative 4: Quant-Iron (Rust + OpenCL)
+
+A Rust simulator leveraging OpenCL for GPU acceleration.
+
+| Criterion | Assessment |
+|-----------|------------|
+| Performance | Excellent on GPU-equipped hardware |
+| WASM support | OpenCL incompatible with WASM |
+| Edge deployment | Most edge nodes lack discrete GPUs |
+| Complexity | OpenCL runtime adds operational burden |
+
+**Rejected**: OpenCL dependency incompatible with WASM and edge deployment model.
+
+### Alternative 5: No Simulator (Cloud Quantum APIs)
+
+Delegate all quantum computation to cloud-based quantum simulators or hardware.
+
+| Criterion | Assessment |
+|-----------|------------|
+| Performance | Network-bound latency |
+| Offline support | None; requires connectivity |
+| Cost | Per-execution charges |
+| Privacy | Circuit data sent to third party |
+| Edge philosophy | Violates offline-first design |
+
+**Rejected**: Fundamentally incompatible with ruVector's offline-first edge
+computing philosophy.
+
+## Consequences
+
+### Positive
+
+- **Full control**: Complete ownership of the simulation pipeline, enabling
+  deep integration with ruVector's math, SIMD, and memory subsystems
+- **WASM portable**: Single codebase compiles to any WASM runtime, enabling
+  browser-based quantum experimentation
+- **No external dependencies**: Eliminates supply chain risk from C/C++ or
+  Python library dependencies
+- **Edge-aligned**: Event-driven activation model matches Cognitum's power
+  architecture
+- **Extensible**: Gate set, noise models, and backends can evolve independently
+
+### Negative
+
+- **Development effort**: Building a competitive quantum simulator from scratch
+  requires significant engineering investment
+- **Maintenance burden**: Team must benchmark, optimize, and maintain the
+  simulation engine alongside the rest of ruVector
+- **Classical simulation limits**: Exponential scaling is a fundamental physics
+  constraint; the engine cannot exceed ~30 qubits on practical hardware
+
+### Risks and Mitigations
+
+| Risk | Likelihood | Impact | Mitigation |
+|------|-----------|--------|------------|
+| Performance below competitors | Medium | High | Benchmark-driven development against QuantRS2/Qukit |
+| Floating-point accuracy drift | Low | Medium | Comprehensive numerical tests, optional f64 enforcement |
+| WASM memory exhaustion | Medium | Medium | Hard qubit limit with clear error messages (ADR-QE-003) |
+| Scope creep into hardware simulation | Low | Low | Strict scope: gate-model only, no analog/pulse simulation |
+
+## References
+
+- [ADR-005: WASM Runtime Integration](/docs/adr/ADR-005-wasm-runtime-integration.md)
+- [ADR-003: SIMD Optimization Strategy](/docs/adr/ADR-003-simd-optimization-strategy.md)
+- [ADR-006: Memory Management](/docs/adr/ADR-006-memory-management.md)
+- [ADR-014: Coherence Engine](/docs/adr/ADR-014-coherence-engine.md)
+- [ADR-QE-002: Crate Structure & Integration](./ADR-QE-002-crate-structure-integration.md)
+- [ADR-QE-003: WASM Compilation Strategy](./ADR-QE-003-wasm-compilation-strategy.md)
+- [ADR-QE-004: Performance Optimization & Benchmarks](./ADR-QE-004-performance-optimization-benchmarks.md)
+- Nielsen & Chuang, "Quantum Computation and Quantum Information" (2010)
+- Aaronson & Gottesman, "Improved simulation of stabilizer circuits" (2004)
diff --git a/docs/adr/quantum-engine/ADR-QE-002-crate-structure-integration.md b/docs/adr/quantum-engine/ADR-QE-002-crate-structure-integration.md
new file mode 100644
index 00000000..226079c6
--- /dev/null
+++ b/docs/adr/quantum-engine/ADR-QE-002-crate-structure-integration.md
@@ -0,0 +1,474 @@
+# ADR-QE-002: Crate Structure & ruVector Integration
+
+**Status**: Proposed
+**Date**: 2026-02-06
+**Authors**: ruv.io, RuVector Team
+**Deciders**: Architecture Review Board
+
+## Context
+
+### Problem Statement
+
+The quantum engine must fit within the ruVector workspace, which currently
+comprises 73+ crates following a consistent modular architecture. The existing
+`ruQu` crate handles classical coherence monitoring -- specifically min-cut
+analysis and MWPM (Minimum Weight Perfect Matching) decoding for error
+correction analysis. The new quantum simulation capability requires clear
+separation from this classical functionality while integrating deeply with
+ruVector's shared infrastructure.
+
+### Existing Workspace Patterns
+
+The ruVector workspace follows established conventions that the quantum engine
+must respect:
+
+```
+ruvector/
+  crates/
+    ruvector-math/          # SIMD-optimized linear algebra
+    ruvector-hnsw/          # Vector similarity search
+    ruvector-metrics/       # Observability and telemetry
+    ruvector-router-wasm/   # WASM bindings for routing
+    ruQu/                   # Classical coherence (min-cut, MWPM)
+    ...73+ crates
+  Cargo.toml                # Workspace root
+```
+
+Key conventions observed:
+
+- **`no_std` + `alloc`** for maximum portability
+- **Feature flags** for optional capabilities (parallel, gpu, etc.)
+- **Separate WASM crates** for browser-facing bindings (e.g., `ruvector-router-wasm`)
+- **Metrics integration** via `ruvector-metrics` for observability
+- **SIMD reuse** via `ruvector-math` for hot-path computations
+
+### Integration Points
+
+The quantum engine must interact with several existing subsystems:
+
+```
+                    +-------------------+
+                    |  Agent Framework  |
+                    +--------+----------+
+                             |
+                    trigger circuit execution
+                             |
+                    +--------v----------+
+                    |   ruqu-core       |
+                    | (quantum sim)     |
+                    +---+------+--------+
+                        |      |
+             +----------+      +----------+
+             |                            |
+    +--------v--------+       +-----------v---------+
+    | ruvector-math   |       | ruvector-metrics    |
+    | (SIMD, linalg)  |       | (telemetry)         |
+    +-----------------+       +---------------------+
+             |
+    +--------v--------+
+    | ruQu (existing) |
+    | (min-cut, MWPM) |
+    +-----------------+
+```
+
+## Decision
+
+Adopt a **three-crate architecture** for the quantum engine, each with a
+clearly defined responsibility boundary.
+
+### Crate 1: `ruqu-core` -- Pure Rust Simulation Library
+
+The core simulation engine, containing all quantum computation logic.
+
+**Responsibilities**:
+- `QuantumCircuit`: Circuit representation and manipulation
+- `QuantumState`: State-vector storage and operations
+- `Gate` enum: Full gate set (Pauli, Hadamard, CNOT, Toffoli, parametric rotations, etc.)
+- Measurement operations (computational basis, Pauli basis, mid-circuit)
+- Circuit optimization passes (gate fusion, cancellation)
+- Noise model application (optional)
+- Entanglement tracking for state splitting
+
+**Design constraints**:
+- `#![no_std]` with `alloc` for embedded/WASM portability
+- Zero required external dependencies beyond `alloc`
+- All platform-specific code behind feature flags
+
+**Feature flags**:
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `std` | off | Enable std library features (file I/O, advanced error types) |
+| `parallel` | off | Enable Rayon-based multi-threaded gate application |
+| `gpu` | off | Enable wgpu-based GPU acceleration for large states |
+| `tensor-network` | off | Enable tensor network backend for shallow circuits |
+| `noise-model` | off | Enable depolarizing, amplitude damping, and custom noise channels |
+| `f32` | off | Use f32 precision instead of f64 (halves memory, reduces accuracy) |
+| `serde` | off | Enable serialization of circuits and states |
+
+**Module structure**:
+
+```
+ruqu-core/
+  src/
+    lib.rs              # Crate root, feature flag gating
+    state.rs            # QuantumState: amplitude storage, initialization
+    circuit.rs          # QuantumCircuit: gate sequence, metadata
+    gates/
+      mod.rs            # Gate enum and dispatch
+      single.rs         # Single-qubit gates (H, X, Y, Z, S, T, Rx, Ry, Rz, U3)
+      two.rs            # Two-qubit gates (CNOT, CZ, SWAP, Rxx, Ryy, Rzz)
+      multi.rs          # Multi-qubit gates (Toffoli, Fredkin, custom unitaries)
+      parametric.rs     # Parameterized gate support for variational algorithms
+    execution/
+      mod.rs            # Execution engine dispatch
+      statevector.rs    # Full state-vector simulation engine
+      tensor.rs         # Tensor network backend (feature-gated)
+      noise.rs          # Noise channel application (feature-gated)
+    measurement.rs      # Measurement: sampling, expectation values
+    optimize/
+      mod.rs            # Circuit optimization pipeline
+      fusion.rs         # Gate fusion pass
+      cancel.rs         # Gate cancellation (HH=I, XX=I, etc.)
+      commute.rs        # Commutation-based reordering
+    entanglement.rs     # Entanglement tracking and state splitting
+    types.rs            # Complex number types, precision configuration
+    error.rs            # Error types (QubitOverflow, InvalidGate, etc.)
+  Cargo.toml
+  benches/
+    statevector.rs      # Criterion benchmarks for core operations
+```
+
+**Public API surface**:
+
+```rust
+// Core types
+pub struct QuantumState { /* ... */ }
+pub struct QuantumCircuit { /* ... */ }
+pub enum Gate { H, X, Y, Z, S, T, CNOT, CZ, Rx(f64), Ry(f64), Rz(f64), /* ... */ }
+
+// Circuit construction
+impl QuantumCircuit {
+    pub fn new(num_qubits: usize) -> Result<Self, QubitOverflow>;
+    pub fn gate(&mut self, gate: Gate, targets: &[usize]) -> &mut Self;
+    pub fn measure(&mut self, qubit: usize) -> &mut Self;
+    pub fn measure_all(&mut self) -> &mut Self;
+    pub fn barrier(&mut self) -> &mut Self;
+    pub fn depth(&self) -> usize;
+    pub fn gate_count(&self) -> usize;
+    pub fn optimize(&mut self) -> &mut Self;
+}
+
+// Execution
+impl QuantumState {
+    pub fn new(num_qubits: usize) -> Result<Self, QubitOverflow>;
+    pub fn execute(&mut self, circuit: &QuantumCircuit) -> ExecutionResult;
+    pub fn sample(&self, shots: usize) -> Vec<BitString>;
+    pub fn expectation(&self, observable: &Observable) -> f64;
+    pub fn probabilities(&self) -> Vec<f64>;
+    pub fn amplitude(&self, basis_state: usize) -> Complex<f64>;
+}
+```
+
+### Crate 2: `ruqu-wasm` -- WebAssembly Bindings
+
+WASM-specific bindings exposing the quantum engine to JavaScript environments.
+
+**Responsibilities**:
+- wasm-bindgen annotated wrapper types
+- JavaScript-friendly API (string-based circuit construction, JSON results)
+- Memory limit enforcement (reject circuits exceeding WASM address space)
+- Optional multi-threading via wasm-bindgen-rayon
+
+**Design constraints**:
+- Mirrors the `ruvector-router-wasm` crate pattern
+- Thin wrapper; all logic delegated to `ruqu-core`
+- TypeScript type definitions auto-generated
+
+**Module structure**:
+
+```
+ruqu-wasm/
+  src/
+    lib.rs              # wasm-bindgen entry points
+    circuit.rs          # JS-facing QuantumCircuit wrapper
+    state.rs            # JS-facing QuantumState wrapper
+    types.rs            # JS-compatible type conversions
+    limits.rs           # WASM memory limit checks
+  Cargo.toml
+  pkg/                  # wasm-pack output (generated)
+  tests/
+    web.rs              # wasm-bindgen-test browser tests
+```
+
+**JavaScript API**:
+
+```javascript
+import { QuantumCircuit, QuantumState } from 'ruqu-wasm';
+
+// Construct circuit
+const circuit = new QuantumCircuit(4);
+circuit.h(0);
+circuit.cnot(0, 1);
+circuit.cnot(1, 2);
+circuit.cnot(2, 3);
+circuit.measureAll();
+
+// Execute
+const state = new QuantumState(4);
+const result = state.execute(circuit);
+
+// Sample measurement outcomes
+const counts = state.sample(1024);
+console.log(counts);  // { "0000": 512, "1111": 512 }
+
+// Get probabilities
+const probs = state.probabilities();
+```
+
+**Memory limit enforcement**:
+
+```rust
+const WASM_MAX_QUBITS: usize = 25;
+const WASM_MAX_STATE_BYTES: usize = 1 << 30; // 1 GB
+
+pub fn check_wasm_limits(num_qubits: usize) -> Result<(), WasmLimitError> {
+    if num_qubits > WASM_MAX_QUBITS {
+        return Err(WasmLimitError::QubitOverflow {
+            requested: num_qubits,
+            maximum: WASM_MAX_QUBITS,
+            estimated_bytes: 16 * (1usize << num_qubits),
+        });
+    }
+    Ok(())
+}
+```
+
+### Crate 3: `ruqu-algorithms` -- High-Level Algorithm Implementations
+
+Quantum algorithm implementations built on top of `ruqu-core`.
+
+**Responsibilities**:
+- VQE (Variational Quantum Eigensolver) with classical optimizer integration
+- Grover's search with oracle construction helpers
+- QAOA (Quantum Approximate Optimization Algorithm)
+- Quantum error correction (surface codes, stabilizer codes)
+- Hamiltonian simulation primitives (Trotterization)
+
+**Module structure**:
+
+```
+ruqu-algorithms/
+  src/
+    lib.rs
+    vqe/
+      mod.rs            # VQE orchestration
+      ansatz.rs         # Parameterized ansatz circuits (UCCSD, HEA)
+      hamiltonian.rs    # Hamiltonian representation and decomposition
+      optimizer.rs      # Classical optimizer trait + implementations
+    grover/
+      mod.rs            # Grover's algorithm orchestration
+      oracle.rs         # Oracle construction utilities
+      diffusion.rs      # Diffusion operator
+    qaoa/
+      mod.rs            # QAOA orchestration
+      mixer.rs          # Mixer Hamiltonian circuits
+      cost.rs           # Cost function encoding
+    qec/
+      mod.rs            # QEC framework
+      surface.rs        # Surface code implementation
+      stabilizer.rs     # Stabilizer formalism
+      decoder.rs        # Bridge to ruQu's MWPM decoder
+    trotter.rs          # Trotterization for Hamiltonian simulation
+    utils.rs            # Shared utilities (state preparation, etc.)
+  Cargo.toml
+```
+
+**VQE example**:
+
+```rust
+use ruqu_core::{QuantumCircuit, QuantumState};
+use ruqu_algorithms::vqe::{VqeSolver, Hamiltonian, HardwareEfficientAnsatz};
+
+let hamiltonian = Hamiltonian::from_pauli_sum(&[
+    (0.5, "ZZ", &[0, 1]),
+    (0.3, "X",  &[0]),
+    (0.3, "X",  &[1]),
+]);
+
+let ansatz = HardwareEfficientAnsatz::new(2, depth: 3);
+
+let solver = VqeSolver::new(hamiltonian, ansatz)
+    .optimizer(NelderMead::default())
+    .max_iterations(200)
+    .convergence_threshold(1e-6);
+
+let result = solver.solve();
+println!("Ground state energy: {:.6}", result.energy);
+```
+
+### Integration Points
+
+#### Agent Activation
+
+Quantum circuits are triggered via the ruVector agent context system. An agent
+can invoke simulation through graph query extensions:
+
+```
+Agent Query: "Simulate VQE for H2 molecule at bond length 0.74 A"
+    |
+    v
+Agent Framework --> ruqu-algorithms::vqe::VqeSolver
+    |                    |
+    |                    +--> ruqu-core (multiple circuit executions)
+    |                    |
+    |<-- VqeResult ------+
+    |
+    v
+Agent Response: { energy: -1.137, parameters: [...], iterations: 47 }
+```
+
+#### Memory Gating
+
+Following ruVector's memory discipline (ADR-006):
+
+- State vectors allocated exclusively within `QuantumState::new()` scope
+- All amplitudes dropped when `QuantumState` goes out of scope
+- No lazy or cached allocations persist between simulations
+- Peak memory tracked and reported via `ruvector-metrics`
+
+#### Observability
+
+Every simulation reports metrics through the existing `ruvector-metrics` pipeline:
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `ruqu.simulation.qubits` | Gauge | Number of qubits in current simulation |
+| `ruqu.simulation.gates` | Counter | Total gates applied |
+| `ruqu.simulation.depth` | Gauge | Circuit depth after optimization |
+| `ruqu.simulation.duration_ns` | Histogram | Wall-clock simulation time |
+| `ruqu.simulation.peak_memory_bytes` | Gauge | Peak memory during simulation |
+| `ruqu.optimization.gates_eliminated` | Counter | Gates removed by optimization passes |
+| `ruqu.measurement.shots` | Counter | Total measurement shots taken |
+
+#### Coherence Bridge
+
+The existing `ruQu` crate's min-cut analysis and MWPM decoders remain in place
+and become accessible from `ruqu-algorithms` for quantum error correction:
+
+```
+ruqu-algorithms::qec::surface
+    |
+    +-- build syndrome graph
+    |
+    +-- invoke ruQu::mwpm::decode(syndrome)
+    |
+    +-- apply corrections to ruqu-core::QuantumState
+```
+
+This avoids duplicating decoding logic and leverages the existing, tested
+classical infrastructure.
+
+#### Math Reuse
+
+`ruqu-core` depends on `ruvector-math` for SIMD-optimized operations:
+
+- Complex number arithmetic (add, multiply, conjugate) using SIMD lanes
+- Aligned memory allocation for state vectors
+- Batch operations on amplitude arrays
+- Norm calculation for state normalization
+
+```rust
+// In ruqu-core, gate application uses ruvector-math SIMD utilities
+use ruvector_math::simd::{complex_mul_f64x4, complex_add_f64x4};
+
+fn apply_single_qubit_gate(
+    state: &mut [Complex<f64>],
+    target: usize,
+    matrix: [[Complex<f64>; 2]; 2],
+) {
+    let step = 1 << target;
+    for block in (0..state.len()).step_by(2 * step) {
+        for i in block..block + step {
+            let (a, b) = (state[i], state[i + step]);
+            state[i]        = matrix[0][0] * a + matrix[0][1] * b;
+            state[i + step] = matrix[1][0] * a + matrix[1][1] * b;
+        }
+    }
+}
+```
+
+### Dependency Graph
+
+```
+ruqu-algorithms
+    |
+    +---> ruqu-core
+    |        |
+    |        +---> ruvector-math (SIMD utilities)
+    |        +---> ruvector-metrics (optional, behind "metrics" feature)
+    |
+    +---> ruQu (existing, for MWPM decoders in QEC)
+
+ruqu-wasm
+    |
+    +---> ruqu-core
+    +---> wasm-bindgen
+    +---> wasm-bindgen-rayon (optional, behind "threads" feature)
+```
+
+### Workspace Cargo.toml Additions
+
+```toml
+[workspace]
+members = [
+    # ... existing 73+ crates ...
+    "crates/ruqu-core",
+    "crates/ruqu-wasm",
+    "crates/ruqu-algorithms",
+]
+```
+
+## Consequences
+
+### Positive
+
+- **Clean separation of concerns**: Each crate has a single, well-defined
+  responsibility -- simulation, WASM bindings, and algorithms respectively
+- **Independent testing**: Each crate can be tested in isolation with its own
+  benchmark suite
+- **Minimal WASM surface**: `ruqu-wasm` remains a thin wrapper, keeping the
+  compiled `.wasm` module small
+- **Reuse of infrastructure**: SIMD, metrics, and classical decoders are shared,
+  not duplicated
+- **Follows workspace conventions**: Same patterns as existing crates, reducing
+  onboarding friction for contributors
+
+### Negative
+
+- **Three crates to maintain**: Each requires its own CI, documentation, and
+  version management
+- **Cross-crate API stabilization**: Changes to `ruqu-core`'s public API affect
+  both `ruqu-wasm` and `ruqu-algorithms`
+- **Feature flag combinatorics**: Multiple feature flags across three crates
+  create a testing matrix that must be validated
+
+### Risks and Mitigations
+
+| Risk | Mitigation |
+|------|------------|
+| API churn in ruqu-core destabilizing dependents | Semver discipline; stabilize core types before 1.0 |
+| Feature flag combinations causing compilation failures | CI matrix testing all supported flag combinations |
+| Coherence bridge creating tight coupling with ruQu | Trait-based decoder interface; ruQu dependency optional |
+| WASM crate size exceeding 2MB target | Regular binary size audits; aggressive dead code elimination |
+
+## References
+
+- [ADR-QE-001: Quantum Engine Core Architecture](./ADR-QE-001-quantum-engine-core-architecture.md)
+- [ADR-QE-003: WASM Compilation Strategy](./ADR-QE-003-wasm-compilation-strategy.md)
+- [ADR-QE-004: Performance Optimization & Benchmarks](./ADR-QE-004-performance-optimization-benchmarks.md)
+- [Workspace Cargo.toml](/Cargo.toml)
+- [ruvector-router-wasm pattern](/crates/ruvector-router-wasm/)
+- [ruQu crate](/crates/ruQu/)
+- [ruvector-math crate](/crates/ruvector-math/)
+- [ruvector-metrics crate](/crates/ruvector-metrics/)
diff --git a/docs/adr/quantum-engine/ADR-QE-003-wasm-compilation-strategy.md b/docs/adr/quantum-engine/ADR-QE-003-wasm-compilation-strategy.md
new file mode 100644
index 00000000..680ee47f
--- /dev/null
+++ b/docs/adr/quantum-engine/ADR-QE-003-wasm-compilation-strategy.md
@@ -0,0 +1,459 @@
+# ADR-QE-003: WebAssembly Compilation Strategy
+
+**Status**: Proposed
+**Date**: 2026-02-06
+**Authors**: ruv.io, RuVector Team
+**Deciders**: Architecture Review Board
+
+## Context
+
+### Problem Statement
+
+ruVector targets browsers, embedded/edge runtimes, and IoT devices via
+WebAssembly. The quantum simulation engine must compile to
+`wasm32-unknown-unknown` and run correctly in these constrained environments.
+WASM introduces fundamental constraints that differ significantly from native
+execution and must be addressed at the architectural level rather than
+worked around at runtime.
+
+### WASM Execution Environment Constraints
+
+| Constraint | Detail | Impact on Quantum Simulation |
+|------------|--------|------------------------------|
+| 32-bit address space | ~4 GB theoretical max, ~2 GB practical | Hard ceiling on state vector size |
+| Memory model | Linear memory, grows in 64 KB pages | Allocation must be page-aware |
+| No native threads | Web Workers required for parallelism | Requires SharedArrayBuffer + COOP/COEP headers |
+| No direct GPU | WebGPU is separate API, not WASM-native | GPU acceleration unavailable in WASM path |
+| No OS syscalls | Sandboxed execution, no file/network | All I/O must go through host bindings |
+| JIT compilation | V8/SpiderMonkey JIT, not AOT | ~1.5-3x slower than native, variable warmup |
+| SIMD support | 128-bit SIMD proposal (widely supported since 2021) | 4 f32 or 2 f64 per vector lane |
+| Stack size | Default ~1 MB, configurable | Deep recursion limited |
+
+### Memory Budget Analysis for Quantum Simulation
+
+The critical constraint is WASM's 32-bit address space. With a practical
+usable limit of approximately 2 GB (due to browser memory allocation
+behavior and address space fragmentation), the maximum feasible state vector
+size is bounded:
+
+```
+Available WASM Memory Budget:
+
+  Total addressable:     4,294,967,296 bytes  (4 GB theoretical)
+  Practical usable:     ~2,147,483,648 bytes  (2 GB, browser-dependent)
+  WASM overhead:          ~100,000,000 bytes  (module, stack, heap metadata)
+  Application overhead:    ~50,000,000 bytes  (circuit data, scratch buffers)
+  -------------------------------------------------
+  Available for state:  ~2,000,000,000 bytes  (1.86 GB)
+
+  State vector sizes:
+    24 qubits:  268,435,456 bytes (256 MB)  -- comfortable
+    25 qubits:  536,870,912 bytes (512 MB)  -- feasible
+    25 + scratch: ~1,073,741,824 bytes       -- tight but within budget
+    26 qubits: 1,073,741,824 bytes (1 GB)   -- state alone, no scratch room
+    27 qubits: 2,147,483,648 bytes (2 GB)   -- exceeds practical limit
+```
+
+### Existing WASM Patterns in ruVector
+
+The `ruvector-router-wasm` crate establishes conventions for WASM compilation:
+
+- `wasm-pack build` as the compilation tool
+- `wasm-bindgen` for JavaScript interop
+- TypeScript definition generation
+- Feature-flag controlled inclusion/exclusion of capabilities
+- Dedicated test suites using `wasm-bindgen-test`
+
+## Decision
+
+### 1. Target and Toolchain
+
+**Target triple**: `wasm32-unknown-unknown`
+
+**Build toolchain**: `wasm-pack` with `wasm-bindgen`
+
+```bash
+# Development build
+wasm-pack build crates/ruqu-wasm --target web --dev
+
+# Release build with size optimization
+wasm-pack build crates/ruqu-wasm --target web --release
+
+# Node.js target (for server-side WASM)
+wasm-pack build crates/ruqu-wasm --target nodejs --release
+```
+
+**Cargo profile for WASM release**:
+
+```toml
+[profile.wasm-release]
+inherits = "release"
+opt-level = "z"          # Optimize for binary size
+lto = true               # Link-time optimization
+codegen-units = 1        # Single codegen unit for maximum optimization
+strip = true             # Strip debug symbols
+panic = "abort"          # Smaller panic handling
+```
+
+### 2. Memory Limit Enforcement
+
+`ruqu-wasm` enforces qubit limits before any allocation occurs. This is a hard
+gate, not a soft warning.
+
+**Enforcement strategy**:
+
+```
+User requests N qubits
+        |
+        v
+  [N <= 25?] ---NO---> Return WasmLimitError {
+        |                 requested: N,
+       YES                maximum: 25,
+        |                 estimated_memory: 16 * 2^N,
+        v                 suggestion: "Use native build for >25 qubits"
+  [Estimate total       }
+   memory needed]
+        |
+        v
+  [< 1.5 GB?] ---NO---> Return WasmLimitError::InsufficientMemory
+        |
+       YES
+        |
+        v
+  Proceed with allocation
+```
+
+**Qubit limits by precision**:
+
+| Precision | Max Qubits (WASM) | State Size | With Scratch |
+|-----------|--------------------|------------|--------------|
+| Complex f64 (default) | 25 | 512 MB | ~1.07 GB |
+| Complex f32 (optional) | 26 | 512 MB | ~1.07 GB |
+
+**Error reporting**:
+
+```rust
+#[wasm_bindgen]
+#[derive(Debug)]
+pub struct WasmLimitError {
+    pub requested_qubits: usize,
+    pub maximum_qubits: usize,
+    pub estimated_bytes: usize,
+    pub message: String,
+}
+
+impl WasmLimitError {
+    pub fn qubit_overflow(requested: usize) -> Self {
+        let max = if cfg!(feature = "f32") { 26 } else { 25 };
+        let bytes_per_amplitude = if cfg!(feature = "f32") { 8 } else { 16 };
+        Self {
+            requested_qubits: requested,
+            maximum_qubits: max,
+            estimated_bytes: bytes_per_amplitude * (1usize << requested),
+            message: format!(
+                "Cannot simulate {} qubits in WASM: requires {} bytes, \
+                 exceeds WASM address space. Maximum: {} qubits. \
+                 Use native build for larger simulations.",
+                requested,
+                bytes_per_amplitude * (1usize << requested),
+                max
+            ),
+        }
+    }
+}
+```
+
+### 3. Threading Strategy
+
+WASM multi-threading requires SharedArrayBuffer, which in turn requires
+specific HTTP security headers (Cross-Origin-Opener-Policy and
+Cross-Origin-Embedder-Policy). Not all deployment environments support these.
+
+**Strategy**: Optional multi-threading with graceful fallback.
+
+```
+                  ruqu-wasm execution
+                        |
+                        v
+              [SharedArrayBuffer
+               available?]
+                /           \
+              YES            NO
+              /               \
+    [wasm-bindgen-rayon]    [single-threaded
+     parallel execution]     execution]
+              |                    |
+     Split state vector      Sequential gate
+     across Web Workers      application
+              |                    |
+              v                    v
+         Fast (N cores)     Slower (1 core)
+```
+
+**Compile-time configuration**:
+
+```toml
+# In ruqu-wasm/Cargo.toml
+[features]
+default = []
+threads = ["wasm-bindgen-rayon", "ruqu-core/parallel"]
+```
+
+**Runtime detection**:
+
+```rust
+#[wasm_bindgen]
+pub fn threading_available() -> bool {
+    // Check if SharedArrayBuffer is available in this environment
+    js_sys::eval("typeof SharedArrayBuffer !== 'undefined'")
+        .ok()
+        .and_then(|v| v.as_bool())
+        .unwrap_or(false)
+}
+```
+
+**Required HTTP headers for threading**:
+
+```
+Cross-Origin-Opener-Policy: same-origin
+Cross-Origin-Embedder-Policy: require-corp
+```
+
+### 4. SIMD Utilization
+
+The WASM SIMD proposal (128-bit vectors) is widely supported in modern browsers
+and runtimes. The quantum engine uses SIMD for amplitude manipulation when
+available.
+
+**WASM SIMD capabilities**:
+
+| Operation | WASM SIMD Instruction | Use in Quantum Sim |
+|-----------|-----------------------|--------------------|
+| f64x2 multiply | `f64x2.mul` | Complex multiplication (real part) |
+| f64x2 add | `f64x2.add` | Amplitude accumulation |
+| f64x2 sub | `f64x2.sub` | Complex multiplication (cross terms) |
+| f64x2 shuffle | `i64x2.shuffle` | Swapping real/imaginary parts |
+| f32x4 multiply | `f32x4.mul` | f32 mode complex multiply |
+| f32x4 fma | emulated | Fused multiply-add for accuracy |
+
+**Conditional compilation**:
+
+```rust
+// In ruqu-core, WASM SIMD path
+#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+mod wasm_simd {
+    use core::arch::wasm32::*;
+
+    /// Apply 2x2 unitary to a pair of amplitudes using WASM SIMD
+    #[inline(always)]
+    pub fn apply_gate_2x2_simd(
+        a_re: f64, a_im: f64,
+        b_re: f64, b_im: f64,
+        u00_re: f64, u00_im: f64,
+        u01_re: f64, u01_im: f64,
+        u10_re: f64, u10_im: f64,
+        u11_re: f64, u11_im: f64,
+    ) -> (f64, f64, f64, f64) {
+        // Pack amplitude pair into SIMD lanes
+        let a = f64x2(a_re, a_im);
+        let b = f64x2(b_re, b_im);
+
+        // Complex multiply-accumulate for output amplitudes
+        // c0 = u00*a + u01*b
+        // c1 = u10*a + u11*b
+        // (expanded for complex arithmetic)
+        // ...
+        todo!()
+    }
+}
+
+// Fallback scalar path
+#[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
+mod scalar {
+    // Pure scalar complex arithmetic
+}
+```
+
+**Comparison of SIMD widths across targets**:
+
+```
+Native (AVX-512):  512-bit  =  8 f64  =  4 complex f64 per instruction
+Native (AVX2):     256-bit  =  4 f64  =  2 complex f64 per instruction
+Native (NEON):     128-bit  =  2 f64  =  1 complex f64 per instruction
+WASM SIMD:         128-bit  =  2 f64  =  1 complex f64 per instruction
+```
+
+WASM SIMD matches ARM NEON width but is slower due to JIT overhead. The engine
+uses the same algorithmic structure as the NEON path, adapted for WASM SIMD
+intrinsics.
+
+### 5. No GPU in WASM
+
+GPU acceleration is exclusively available in native builds. The WASM path
+uses CPU-only simulation.
+
+**Rationale**:
+- WebGPU is a separate browser API, not accessible from WASM linear memory
+- Bridging WASM to WebGPU would require complex JavaScript glue code
+- WebGPU compute shader support varies across browsers
+- The performance benefit is uncertain for the 25-qubit WASM ceiling
+
+**Future consideration**: If WebGPU stabilizes and WASM-WebGPU interop matures,
+a `ruqu-webgpu` crate could provide browser-side GPU acceleration. This is out
+of scope for the initial release.
+
+### 6. API Parity
+
+`ruqu-wasm` exposes an API that is functionally identical to `ruqu-core` native.
+The same circuit description produces the same measurement results (within
+floating-point tolerance). Only performance and capacity differ.
+
+**Parity guarantee**:
+
+```
+                    Same Circuit
+                        |
+           +------------+------------+
+           |                         |
+     ruqu-core (native)       ruqu-wasm (browser)
+           |                         |
+    - 30+ qubits              - 25 qubits max
+    - AVX2/AVX-512 SIMD       - WASM SIMD128
+    - Rayon threading          - Optional Web Workers
+    - Optional GPU             - CPU only
+    - ~17.5M gates/sec         - ~5-12M gates/sec
+           |                         |
+           +------------+------------+
+                        |
+                  Same Results
+              (within fp tolerance)
+```
+
+**Verified by**: Shared test suite that runs against both native and WASM targets,
+comparing outputs bitwise (for deterministic operations) or statistically (for
+measurement sampling).
+
+### 7. Module Size Target
+
+Target `.wasm` binary size: **< 2 MB** for the default feature set.
+
+**Size budget**:
+
+| Component | Estimated Size |
+|-----------|---------------|
+| Core simulation engine | ~800 KB |
+| Gate implementations | ~200 KB |
+| Measurement and sampling | ~100 KB |
+| wasm-bindgen glue | ~50 KB |
+| Circuit optimization | ~150 KB |
+| Error handling and validation | ~50 KB |
+| **Total (default features)** | **~1.35 MB** |
+| + noise-model feature | +200 KB |
+| + tensor-network feature | +400 KB |
+| **Total (all features)** | **~1.95 MB** |
+
+**Size reduction techniques**:
+- `opt-level = "z"` for size-optimized compilation
+- LTO (Link-Time Optimization) for dead code elimination
+- `wasm-opt` post-processing pass (binaryen)
+- Feature flags to exclude unused capabilities
+- `panic = "abort"` to eliminate unwinding machinery
+- Avoid `format!` and `std::fmt` where possible in hot paths
+
+**Build pipeline**:
+
+```bash
+# Build with wasm-pack
+wasm-pack build crates/ruqu-wasm --target web --release
+
+# Post-process with wasm-opt for additional size reduction
+wasm-opt -Oz --enable-simd \
+    crates/ruqu-wasm/pkg/ruqu_wasm_bg.wasm \
+    -o crates/ruqu-wasm/pkg/ruqu_wasm_bg.wasm
+
+# Verify size
+ls -lh crates/ruqu-wasm/pkg/ruqu_wasm_bg.wasm
+# Expected: < 2 MB
+```
+
+### 8. Future: wasm64 (Memory64 Proposal)
+
+The WebAssembly Memory64 proposal extends the address space to 64 bits,
+removing the 4 GB limitation. When this proposal reaches broad runtime support:
+
+- Recompile `ruqu-wasm` targeting `wasm64-unknown-unknown`
+- Lift the 25-qubit ceiling to match native limits
+- Maintain backward compatibility with wasm32 via conditional compilation
+
+**Current status**: Memory64 is at Phase 4 (standardized) in the WASM
+specification process. Browser support is emerging but not yet universal.
+
+**Migration path**:
+
+```toml
+# Future Cargo.toml
+[features]
+wasm64 = []  # Enable when targeting wasm64
+
+# In code
+#[cfg(feature = "wasm64")]
+const MAX_QUBITS_WASM: usize = 30;
+
+#[cfg(not(feature = "wasm64"))]
+const MAX_QUBITS_WASM: usize = 25;
+```
+
+## Trade-offs Accepted
+
+| Trade-off | Accepted Limitation | Justification |
+|-----------|---------------------|---------------|
+| Performance | ~1.5-3x slower than native | Universal deployment outweighs raw speed |
+| Qubit ceiling | 25 qubits in WASM vs 30+ native | Sufficient for most educational and research workloads |
+| Threading | Requires specific browser headers | Graceful fallback ensures always-works baseline |
+| No GPU | CPU-only in browser | GPU simulation at 25 qubits shows minimal benefit |
+| Binary size | ~1.35 MB module | Acceptable for a quantum simulation library |
+
+## Consequences
+
+### Positive
+
+- **Universal deployment**: Any modern browser or WASM runtime can execute
+  quantum simulations without installation
+- **Security sandboxing**: WASM's memory isolation prevents quantum simulation
+  code from accessing host resources
+- **Edge-aligned**: Matches ruVector's philosophy of computation at the edge
+- **Testable**: WASM builds can be tested in CI via headless browsers and
+  wasm-bindgen-test
+- **Progressive enhancement**: Single-threaded baseline with optional threading
+  ensures broad compatibility
+
+### Negative
+
+- **Performance ceiling**: JIT overhead and narrower SIMD limit throughput
+- **Memory limits**: 25-qubit hard ceiling until wasm64 adoption
+- **Threading complexity**: SharedArrayBuffer requirement adds deployment
+  configuration burden
+- **Debugging difficulty**: WASM debugging tools are less mature than native
+  debuggers
+
+### Mitigations
+
+| Issue | Mitigation |
+|-------|------------|
+| Performance gap | Document native vs WASM trade-offs; recommend native for >20 qubits |
+| Memory exhaustion | Hard limit enforcement with informative error messages |
+| Threading failures | Automatic fallback to single-threaded; no silent degradation |
+| Debug difficulty | Source maps via wasm-pack; comprehensive logging to console |
+| Binary size creep | CI size gate: fail build if .wasm exceeds 2 MB |
+
+## References
+
+- [ADR-QE-001: Quantum Engine Core Architecture](./ADR-QE-001-quantum-engine-core-architecture.md)
+- [ADR-QE-002: Crate Structure & Integration](./ADR-QE-002-crate-structure-integration.md)
+- [ADR-QE-004: Performance Optimization & Benchmarks](./ADR-QE-004-performance-optimization-benchmarks.md)
+- [ADR-005: WASM Runtime Integration](/docs/adr/ADR-005-wasm-runtime-integration.md)
+- [ruvector-router-wasm crate](/crates/ruvector-router-wasm/)
+- [WebAssembly SIMD Proposal](https://github.com/WebAssembly/simd)
+- [WebAssembly Memory64 Proposal](https://github.com/WebAssembly/memory64)
+- [wasm-bindgen-rayon](https://github.com/RReverser/wasm-bindgen-rayon)
+- [Cross-Origin Isolation Guide (MDN)](https://developer.mozilla.org/en-US/docs/Web/API/crossOriginIsolated)
diff --git a/docs/adr/quantum-engine/ADR-QE-004-performance-optimization-benchmarks.md b/docs/adr/quantum-engine/ADR-QE-004-performance-optimization-benchmarks.md
new file mode 100644
index 00000000..562f8b2c
--- /dev/null
+++ b/docs/adr/quantum-engine/ADR-QE-004-performance-optimization-benchmarks.md
@@ -0,0 +1,564 @@
+# ADR-QE-004: Performance Optimization & Benchmarks
+
+**Status**: Proposed
+**Date**: 2026-02-06
+**Authors**: ruv.io, RuVector Team
+**Deciders**: Architecture Review Board
+
+## Context
+
+### Problem Statement
+
+Quantum state-vector simulation is computationally expensive. Every gate
+application touches the full amplitude vector of 2^n complex numbers, making
+gate application O(2^n) per gate for n qubits. For the quantum engine to be
+practical on edge devices and in browser environments, it must achieve
+competitive performance: millions of gates per second for small circuits,
+interactive latency for 10-20 qubit workloads, and the ability to handle
+moderately deep circuits (thousands of gates) without unacceptable delays.
+
+### Computational Cost Model
+
+For a circuit with n qubits, g gates, and s measurement shots:
+
+```
+Total operations (approximate):
+
+  Single-qubit gate:   2^n complex multiplications + 2^n complex additions
+  Two-qubit gate:      2^(n+1) complex multiplications + 2^(n+1) complex additions
+  Measurement (1 shot): 2^n probability calculations + sampling
+  Full circuit:        sum_i(cost(gate_i)) + s * 2^n
+
+  Example: 20-qubit circuit, 500 gates, 1024 shots
+    Gate cost:  500 * 2^20 * ~4 FLOP = ~2.1 billion FLOP
+    Measure:    1024 * 2^20 * ~2 FLOP = ~2.1 billion FLOP
+    Total:      ~4.2 billion FLOP
+```
+
+At 10 GFLOP/s (realistic single-core throughput), this is ~420 ms. With SIMD
+and multi-threading, we target 10-50x improvement.
+
+### Performance Baseline from Comparable Systems
+
+| Simulator | Language | 20-qubit H gate | Notes |
+|-----------|----------|-----------------|-------|
+| Qiskit Aer | C++/Python | ~50 ns | Heavily optimized, OpenMP |
+| Cirq | Python/C++ | ~200 ns | Google, less optimized |
+| QuantRS2 | Rust | ~57 ns | Rust-native, AVX2 |
+| Quest | C | ~40 ns | GPU-capable, highly tuned |
+| Target (ruQu) | Rust | < 60 ns | Competitive with QuantRS2 |
+
+These benchmarks measure per-gate time on a single-qubit Hadamard applied to
+a 20-qubit state vector. Our target is to match or beat QuantRS2, the closest
+comparable pure-Rust implementation.
+
+## Decision
+
+Implement a **multi-layered optimization strategy** with six complementary
+techniques, each addressing a different performance bottleneck.
+
+### Layer 1: SIMD Operations
+
+Use `ruvector-math` SIMD utilities to vectorize amplitude manipulation.
+Gate application fundamentally involves applying a 2x2 or 4x4 unitary matrix
+to pairs/quadruples of complex amplitudes. SIMD processes multiple amplitude
+components simultaneously.
+
+**Native SIMD dispatch**:
+
+```
+Architecture     Instruction Set     Complex f64 per Cycle
+-----------      ---------------     ---------------------
+x86_64           AVX-512             4 (512-bit / 128-bit per complex)
+x86_64           AVX2                2 (256-bit / 128-bit per complex)
+ARM64            NEON                1 (128-bit / 128-bit per complex)
+WASM             SIMD128             1 (128-bit / 128-bit per complex)
+Fallback         Scalar              1 (sequential)
+```
+
+**Single-qubit gate application with AVX2**:
+
+```
+For each pair of amplitudes (a[i], a[i + 2^target]):
+
+  Load:  a_re, a_im = load_f64x4([a[i].re, a[i].im, a[i+step].re, a[i+step].im])
+
+  Compute c0 = u00 * a + u01 * b:
+    mul_re = u00_re * a_re - u00_im * a_im + u01_re * b_re - u01_im * b_im
+    mul_im = u00_re * a_im + u00_im * a_re + u01_re * b_im + u01_im * b_re
+
+  Compute c1 = u10 * a + u11 * b:
+    (analogous)
+
+  Store: [c0.re, c0.im, c1.re, c1.im]
+```
+
+With AVX2 (256-bit), we process 2 complex f64 values per instruction,
+yielding a theoretical 2x speedup over scalar. With AVX-512, this doubles to
+4x. Practical speedup is 1.5-3.5x due to instruction latency and memory
+bandwidth.
+
+**Target per-gate throughput**:
+
+| Qubits | Amplitudes | AVX2 (est.) | AVX-512 (est.) | WASM SIMD (est.) |
+|--------|------------|-------------|----------------|-------------------|
+| 10 | 1,024 | ~15 ns | ~10 ns | ~30 ns |
+| 15 | 32,768 | ~1 us | ~0.5 us | ~2 us |
+| 20 | 1,048,576 | ~50 us | ~25 us | ~100 us |
+| 25 | 33,554,432 | ~1.5 ms | ~0.8 ms | ~3 ms |
+
+### Layer 2: Multithreading
+
+Rayon-based data parallelism splits the state vector across CPU cores for
+gate application. Each thread processes an independent contiguous block of
+amplitudes.
+
+**Parallelization strategy**:
+
+```
+State vector: [amp_0, amp_1, ..., amp_{2^n - 1}]
+
+Thread 0:  [amp_0          ... amp_{2^n/T - 1}]
+Thread 1:  [amp_{2^n/T}    ... amp_{2*2^n/T - 1}]
+  ...
+Thread T-1:[amp_{(T-1)*2^n/T} ... amp_{2^n - 1}]
+
+Where T = number of threads (Rayon work-stealing pool)
+```
+
+**Gate application requires care with target qubit position**:
+
+- If `target < log2(chunk_size)`: each chunk contains complete amplitude pairs.
+  Threads are fully independent. No synchronization needed.
+- If `target >= log2(chunk_size)`: amplitude pairs span chunk boundaries.
+  Must adjust chunk boundaries to align with gate structure.
+
+**Expected scaling**:
+
+```
+Qubits    Amps         1 thread    8 threads    Speedup
+------    ----         --------    ---------    -------
+15        32K          1 us        ~200 ns      ~5x
+20        1M           50 us       ~8 us        ~6x
+22        4M           200 us      ~30 us       ~6.5x
+24        16M          800 us      ~120 us      ~6.7x
+25        32M          1.5 ms      ~220 us      ~6.8x
+```
+
+Speedup plateaus below linear (8x for 8 threads) due to memory bandwidth
+saturation. At 24+ qubits, the state vector exceeds L3 cache and performance
+becomes memory-bound.
+
+**Parallelism threshold**: Do not parallelize below 14 qubits (16K amplitudes).
+The overhead of Rayon's work-stealing exceeds the benefit for small states.
+
+### Layer 3: Gate Fusion
+
+Preprocess circuits to combine consecutive gates into single matrix
+operations, reducing the number of state vector passes.
+
+**Fusion rules**:
+
+```
+Rule 1: Consecutive single-qubit gates on the same qubit
+  Rz(a) -> Rx(b) -> Rz(c)  ==>  U3(a, b, c)  [single matrix multiply]
+
+Rule 2: Consecutive two-qubit gates on the same pair
+  CNOT(0,1) -> CZ(0,1)  ==>  Fused_2Q(0,1)  [4x4 matrix]
+
+Rule 3: Single-qubit gate followed by controlled gate
+  H(0) -> CNOT(0,1)  ==>  Fused operation (absorb H into CNOT matrix)
+
+Rule 4: Identity cancellation
+  H -> H  ==>  Identity (remove both)
+  X -> X  ==>  Identity
+  S -> S_dag  ==>  Identity
+  CNOT -> CNOT (same control/target)  ==>  Identity
+```
+
+**Fusion effectiveness by algorithm**:
+
+| Algorithm | Typical Fusion Ratio | Gate Reduction |
+|-----------|----------------------|----------------|
+| VQE (UCCSD ansatz) | 1.8-2.5x | 30-50% fewer state passes |
+| Grover's | 1.2-1.5x | 15-25% |
+| QAOA | 1.5-2.0x | 25-40% |
+| QFT | 2.0-3.0x | 40-60% |
+| Random circuit | 1.1-1.3x | 5-15% |
+
+**Implementation**:
+
+```rust
+pub struct FusionPass;
+
+impl CircuitOptimizer for FusionPass {
+    fn optimize(&self, circuit: &mut QuantumCircuit) {
+        let mut i = 0;
+        while i < circuit.gates.len() - 1 {
+            let current = &circuit.gates[i];
+            let next = &circuit.gates[i + 1];
+
+            if can_fuse(current, next) {
+                let fused = compute_fused_matrix(current, next);
+                circuit.gates[i] = fused;
+                circuit.gates.remove(i + 1);
+                // Don't advance i; check if we can fuse again
+            } else {
+                i += 1;
+            }
+        }
+    }
+}
+```
+
+### Layer 4: Entanglement-Aware Splitting
+
+Track which qubits have interacted via entangling gates. Simulate independent
+qubit subsets as separate, smaller state vectors. Merge subsets when an
+entangling gate connects them.
+
+**Concept**:
+
+```
+Circuit: q0 --[H]--[CNOT(0,1)]--[Rz]--
+         q1 --[H]--[CNOT(0,1)]--[Ry]--
+         q2 --[H]--[X]---------[Rz]---[CNOT(2,0)]--
+         q3 --[H]--[Y]---------[Rx]--
+
+Initially: {q0}, {q1}, {q2}, {q3}  -- four 2^1 vectors (2 amps each)
+After CNOT(0,1): {q0,q1}, {q2}, {q3}  -- one 2^2 + two 2^1 vectors
+After CNOT(2,0): {q0,q1,q2}, {q3}  -- one 2^3 + one 2^1 vector
+
+Memory: 8 + 2 = 10 amplitudes  vs  2^4 = 16 amplitudes (full)
+```
+
+**Savings scale dramatically for circuits with late entanglement**:
+
+```
+Scenario: 20-qubit circuit, first 100 gates are local, then entangling
+
+Without splitting: 2^20 = 1M amplitudes from gate 1
+With splitting:    20 * 2^1 = 40 amplitudes until first entangling gate
+                   Progressively merge as entanglement grows
+```
+
+**Data structure**:
+
+```rust
+pub struct SplitState {
+    /// Each subset: (qubit indices, state vector)
+    subsets: Vec<(Vec<usize>, QuantumState)>,
+    /// Union-Find structure for tracking connectivity
+    connectivity: UnionFind,
+}
+
+impl SplitState {
+    pub fn apply_gate(&mut self, gate: &Gate, targets: &[usize]) {
+        if gate.is_entangling() {
+            // Merge subsets containing target qubits
+            let merged = self.merge_subsets(targets);
+            // Apply gate to merged state
+            merged.apply_gate(gate, targets);
+        } else {
+            // Apply to the subset containing the target qubit
+            let subset = self.find_subset(targets[0]);
+            subset.apply_gate(gate, targets);
+        }
+    }
+}
+```
+
+**When splitting helps vs. hurts**:
+
+| Circuit Type | Splitting Benefit |
+|-------------|-------------------|
+| Shallow QAOA (p=1-3) | High (qubits entangle gradually) |
+| VQE with local ansatz | High (many local rotations) |
+| Grover's (full oracle) | Low (oracle entangles all qubits early) |
+| QFT | Low (all-to-all entanglement) |
+| Random circuits | Low (entangles quickly) |
+
+The engine automatically disables splitting when all qubits are connected,
+falling back to full state-vector simulation with zero overhead.
+
+### Layer 5: Cache-Local Processing
+
+For large state vectors (>20 qubits), cache utilization becomes critical.
+The state vector exceeds L2 cache (typically 256 KB - 1 MB) and potentially
+L3 cache (8-32 MB).
+
+**Cache analysis**:
+
+```
+Qubits    State Size     L2 (512KB)    L3 (16MB)
+------    ----------     ----------    ---------
+18        4 MB           8x oversize   in cache
+20        16 MB          32x           in cache
+22        64 MB          128x          4x oversize
+24        256 MB         512x          16x oversize
+25        512 MB         1024x         32x oversize
+```
+
+**Techniques**:
+
+1. **Aligned allocation**: State vector aligned to cache line boundaries (64
+   bytes) for optimal prefetch behavior. Uses `ruvector-math` aligned allocator.
+
+2. **Blocking/tiling**: For gates on high-index qubits, the stride between
+   amplitude pairs is large (2^target). Tiling the access pattern to process
+   cache-line-sized blocks sequentially improves spatial locality.
+
+   ```
+   Without tiling (target qubit = 20):
+     Access pattern: amp[0], amp[1M], amp[1], amp[1M+1], ...
+     Cache misses: ~every access (stride = 16 MB)
+
+   With tiling (block size = L2/4):
+     Process block [0..64K], then [64K..128K], ...
+     Cache misses: ~1 per block (sequential within block)
+   ```
+
+3. **Prefetch hints**: Insert software prefetch instructions for the next block
+   of amplitudes while processing the current block.
+
+   ```rust
+   // Prefetch next cache line while processing current
+   #[cfg(target_arch = "x86_64")]
+   unsafe {
+       core::arch::x86_64::_mm_prefetch(
+           state.as_ptr().add(i + CACHE_LINE_AMPS) as *const i8,
+           core::arch::x86_64::_MM_HINT_T0,
+       );
+   }
+   ```
+
+### Layer 6: Lazy Evaluation
+
+Accumulate commuting rotations and defer their application until a
+non-commuting gate appears. This reduces the number of full state-vector
+passes for rotation-heavy circuits common in variational algorithms.
+
+**Commutation rules**:
+
+```
+Rz(a) commutes with Rz(b)  =>  Rz(a+b)
+Rx(a) commutes with Rx(b)  =>  Rx(a+b)
+Rz commutes with CZ        =>  Defer Rz
+Diagonal gates commute      =>  Combine phases
+
+But:
+Rz does NOT commute with H
+Rx does NOT commute with CNOT (on target)
+```
+
+**Implementation sketch**:
+
+```rust
+pub struct LazyAccumulator {
+    /// Pending rotations per qubit: (axis, total_angle)
+    pending: HashMap<usize, Vec<(RotationAxis, f64)>>,
+}
+
+impl LazyAccumulator {
+    pub fn push_gate(&mut self, gate: &Gate, target: usize) -> Option<FlushedGate> {
+        if let Some(rotation) = gate.as_rotation() {
+            if let Some(existing) = self.pending.get_mut(&target) {
+                if existing.last().map_or(false, |(axis, _)| *axis == rotation.axis) {
+                    // Same axis: accumulate angle
+                    existing.last_mut().unwrap().1 += rotation.angle;
+                    return None; // No gate emitted
+                }
+            }
+            self.pending.entry(target).or_default().push((rotation.axis, rotation.angle));
+            None
+        } else {
+            // Non-commuting gate: flush pending rotations for affected qubits
+            let flushed = self.flush(target);
+            Some(flushed)
+        }
+    }
+}
+```
+
+**Effectiveness**: VQE circuits with alternating Rz-Rx-Rz layers see 20-40%
+reduction in state-vector passes. QAOA circuits with repeated ZZ-rotation
+layers see 15-30% reduction.
+
+## Benchmark Targets
+
+### Primary Benchmark Suite
+
+| ID | Workload | Qubits | Gates | Target Time | Notes |
+|----|----------|--------|-------|-------------|-------|
+| B1 | Grover (8 qubits) | 8 | ~200 | < 1 ms | 3 Grover iterations |
+| B2 | Grover (16 qubits) | 16 | ~3,000 | < 10 ms | ~64 iterations |
+| B3 | VQE iteration (12 qubits) | 12 | ~120 | < 5 ms | Single parameter update |
+| B4 | VQE iteration (20 qubits) | 20 | ~300 | < 50 ms | UCCSD ansatz |
+| B5 | QAOA p=3 (10 nodes) | 10 | ~75 | < 1 ms | MaxCut on random graph |
+| B6 | QAOA p=5 (20 nodes) | 20 | ~200 | < 200 ms | MaxCut on random graph |
+| B7 | Surface code cycle (d=3) | 17 | ~20 | < 10 ms | Single syndrome round |
+| B8 | 1000 surface code cycles | 17 | ~20,000 | < 2 s | Repeated error correction |
+| B9 | QFT (20 qubits) | 20 | ~210 | < 30 ms | Full quantum Fourier transform |
+| B10 | Random circuit (25 qubits) | 25 | 100 | < 10 s | Worst-case memory test |
+
+### Micro-Benchmarks
+
+Per-gate timing for individual operations:
+
+| Gate | 10 qubits | 15 qubits | 20 qubits | 25 qubits |
+|------|-----------|-----------|-----------|-----------|
+| H | < 20 ns | < 0.5 us | < 50 us | < 1.5 ms |
+| CNOT | < 30 ns | < 1 us | < 80 us | < 2.5 ms |
+| Rz(theta) | < 15 ns | < 0.4 us | < 40 us | < 1.2 ms |
+| Toffoli | < 50 ns | < 1.5 us | < 120 us | < 4 ms |
+| Measure | < 10 ns | < 0.3 us | < 30 us | < 1 ms |
+
+### WASM-Specific Benchmarks
+
+| ID | Workload | Qubits | Target (WASM) | Target (Native) | Expected Ratio |
+|----|----------|--------|---------------|-----------------|----------------|
+| W1 | Grover (8) | 8 | < 3 ms | < 1 ms | ~3x |
+| W2 | VQE iter (12) | 12 | < 12 ms | < 5 ms | ~2.5x |
+| W3 | QAOA p=3 (10) | 10 | < 2.5 ms | < 1 ms | ~2.5x |
+| W4 | Random (20) | 20 | < 500 ms | < 200 ms | ~2.5x |
+| W5 | Random (25) | 25 | < 25 s | < 10 s | ~2.5x |
+
+### Benchmark Infrastructure
+
+Benchmarks use Criterion.rs for native and a custom timing harness for WASM:
+
+```rust
+// Native benchmarks (Criterion)
+use criterion::{criterion_group, criterion_main, Criterion};
+
+fn bench_grover_8(c: &mut Criterion) {
+    c.bench_function("grover_8_qubits", |b| {
+        b.iter(|| {
+            let mut state = QuantumState::new(8).unwrap();
+            let circuit = grover_circuit(8, &target_state);
+            state.execute(&circuit)
+        })
+    });
+}
+
+fn bench_single_gate_scaling(c: &mut Criterion) {
+    let mut group = c.benchmark_group("hadamard_scaling");
+    for n in [10, 12, 14, 16, 18, 20, 22, 24] {
+        group.bench_with_input(
+            BenchmarkId::from_parameter(n),
+            &n,
+            |b, &n| {
+                let mut state = QuantumState::new(n).unwrap();
+                let mut circuit = QuantumCircuit::new(n).unwrap();
+                circuit.gate(Gate::H, &[0]);
+                b.iter(|| state.execute(&circuit))
+            },
+        );
+    }
+    group.finish();
+}
+
+criterion_group!(benches, bench_grover_8, bench_single_gate_scaling);
+criterion_main!(benches);
+```
+
+**WASM benchmark harness**:
+
+```javascript
+// Browser-based benchmark using performance.now()
+async function benchmarkGrover8() {
+    const { QuantumCircuit, QuantumState } = await import('./ruqu_wasm.js');
+
+    const iterations = 100;
+    const start = performance.now();
+
+    for (let i = 0; i < iterations; i++) {
+        const circuit = QuantumCircuit.grover(8, 42);
+        const state = new QuantumState(8);
+        state.execute(circuit);
+        state.free();
+        circuit.free();
+    }
+
+    const elapsed = performance.now() - start;
+    console.log(`Grover 8-qubit: ${(elapsed / iterations).toFixed(3)} ms/iteration`);
+}
+```
+
+### Performance Regression Detection
+
+CI runs benchmark suite on every PR. Regressions exceeding 10% trigger a
+warning; regressions exceeding 25% block the merge.
+
+```yaml
+# In CI pipeline
+- name: Run benchmarks
+  run: |
+    cargo bench --package ruqu-core -- --save-baseline pr
+    cargo bench --package ruqu-core -- --baseline main --load-baseline pr
+    # critcmp compares and flags regressions
+    critcmp main pr --threshold 10
+```
+
+### Optimization Priority Matrix
+
+Not all optimizations apply equally to all workloads. The priority matrix
+guides implementation order:
+
+| Optimization | Impact (small circuits) | Impact (large circuits) | Impl Effort | Priority |
+|-------------|------------------------|------------------------|-------------|----------|
+| SIMD | Medium (1.5-2x) | High (2-3.5x) | Medium | P0 |
+| Multithreading | Low (overhead > benefit) | High (5-7x) | Medium | P1 |
+| Gate fusion | High (30-50% fewer passes) | Medium (15-30%) | Low | P0 |
+| Entanglement splitting | Variable (0-100x) | Low (quickly entangled) | High | P2 |
+| Cache tiling | Low (fits in cache) | High (2-4x) | Medium | P1 |
+| Lazy evaluation | Medium (20-40%) | Low (10-20%) | Low | P2 |
+
+**Implementation order**: SIMD -> Gate Fusion -> Multithreading -> Cache Tiling
+-> Lazy Evaluation -> Entanglement Splitting
+
+## Consequences
+
+### Positive
+
+- **Competitive performance**: Multi-layered approach targets performance
+  parity with state-of-the-art Rust simulators (QuantRS2)
+- **Interactive latency**: Most practical workloads (8-20 qubits) complete
+  in single-digit milliseconds, enabling real-time experimentation
+- **Scalable**: Each optimization layer addresses a different bottleneck,
+  providing compounding benefits
+- **Measurable**: Concrete benchmark targets enable objective progress tracking
+  and regression detection
+
+### Negative
+
+- **Optimization complexity**: Six optimization layers create significant
+  implementation and maintenance complexity
+- **Ongoing tuning**: Performance characteristics vary across hardware;
+  benchmarks must cover representative platforms
+- **Diminishing returns**: For >20 qubits, memory bandwidth dominates and
+  compute optimizations yield marginal gains
+- **Testing burden**: Each optimization must be validated for numerical
+  correctness across all gate types
+
+### Risks and Mitigations
+
+| Risk | Likelihood | Impact | Mitigation |
+|------|-----------|--------|------------|
+| Memory bandwidth bottleneck at >20 qubits | High | Medium | Document expected scaling; recommend native for large circuits |
+| Gate fusion introducing numerical error | Low | High | Comprehensive numerical tests comparing fused vs. unfused results |
+| Entanglement tracking overhead exceeding savings | Medium | Low | Automatic disable when all qubits connected within first 10 gates |
+| WASM SIMD not available in target runtime | Low | Medium | Graceful fallback to scalar; runtime feature detection |
+| Benchmark targets too aggressive for edge hardware | Medium | Low | Separate targets for edge (Cognitum) vs. desktop; scale expectations |
+
+## References
+
+- [ADR-QE-001: Quantum Engine Core Architecture](./ADR-QE-001-quantum-engine-core-architecture.md)
+- [ADR-QE-002: Crate Structure & Integration](./ADR-QE-002-crate-structure-integration.md)
+- [ADR-QE-003: WASM Compilation Strategy](./ADR-QE-003-wasm-compilation-strategy.md)
+- [ADR-003: SIMD Optimization Strategy](/docs/adr/ADR-003-simd-optimization-strategy.md)
+- [ruvector-math crate](/crates/ruvector-math/)
+- Guerreschi & Hogaboam, "Intel Quantum Simulator: A cloud-ready high-performance
+  simulator of quantum circuits" (2020)
+- Jones et al., "QuEST and High Performance Simulation of Quantum Computers" (2019)
+- QuantRS2 benchmark data (internal comparison)
diff --git a/docs/adr/quantum-engine/ADR-QE-005-vqe-algorithm-support.md b/docs/adr/quantum-engine/ADR-QE-005-vqe-algorithm-support.md
new file mode 100644
index 00000000..6372eaf5
--- /dev/null
+++ b/docs/adr/quantum-engine/ADR-QE-005-vqe-algorithm-support.md
@@ -0,0 +1,650 @@
+# ADR-QE-005: Variational Quantum Eigensolver (VQE) Support
+
+**Status**: Proposed
+**Date**: 2026-02-06
+**Authors**: ruv.io, RuVector Team
+**Deciders**: Architecture Review Board
+
+## Version History
+
+| Version | Date | Author | Changes |
+|---------|------|--------|---------|
+| 0.1 | 2026-02-06 | ruv.io | Initial VQE architecture proposal |
+
+---
+
+## Context
+
+### The Variational Quantum Eigensolver Problem
+
+The Variational Quantum Eigensolver (VQE) is one of the most important near-term quantum
+algorithms, with direct applications in computational chemistry, materials science, and
+combinatorial optimization. VQE computes ground-state energies of molecular Hamiltonians
+by variationally minimizing the expectation value of a Hamiltonian operator with respect
+to a parameterized quantum state (ansatz).
+
+### Why VQE Matters for ruQu
+
+VQE sits at the intersection of quantum simulation and classical optimization, making it
+a natural fit for ruQu's hybrid classical-quantum architecture:
+
+1. **Chemistry applications**: Drug discovery, catalyst design, battery materials
+2. **Optimization**: QUBO problems, portfolio optimization, logistics
+3. **Benchmarking**: VQE circuits exercise the full gate set and serve as a representative
+   workload for evaluating simulator performance
+4. **Agent integration**: ruVector agents can autonomously explore chemical configuration
+   spaces using VQE as the inner evaluation kernel
+
+### Core Requirements
+
+| Requirement | Description | Priority |
+|-------------|-------------|----------|
+| Parameterized circuits | Symbolic gate angles resolved at evaluation time | P0 |
+| Hamiltonian decomposition | Represent H as sum of weighted Pauli strings | P0 |
+| Exact expectation values | Direct state vector computation (no shot noise) | P0 |
+| Gradient evaluation | Parameter-shift rule for classical optimizer | P0 |
+| Shot-based sampling | Optional mode for hardware noise emulation | P1 |
+| Classical optimizer interface | Trait-based abstraction for multiple optimizers | P1 |
+| Hardware-efficient ansatz | Pre-built ansatz library for common topologies | P2 |
+
+### Current Limitations
+
+Without dedicated VQE support, users must manually:
+- Construct parameterized circuits with explicit angle substitution per iteration
+- Decompose Hamiltonians into individual Pauli measurements
+- Implement gradient computation by duplicating circuit evaluations
+- Wire up classical optimizers with no standard interface
+
+This is error-prone and leaves significant performance on the table, since a state vector
+simulator can compute exact expectation values in a single pass without sampling overhead.
+
+---
+
+## Decision
+
+### 1. Parameterized Gate Architecture
+
+Circuits accept symbolic parameters that are resolved to numeric values per evaluation.
+This avoids circuit reconstruction on each VQE iteration.
+
+```
+                ┌──────────────────────────────────────────────────┐
+                │            Parameterized Circuit                  │
+                │                                                    │
+                │  ┌─────┐  ┌──────────┐  ┌─────┐  ┌──────────┐  │
+   |0> ─────────┤  │  H  ├──┤ Ry(θ[0]) ├──┤ CX  ├──┤ Rz(θ[2]) ├──┤───
+                │  └─────┘  └──────────┘  └──┬──┘  └──────────┘  │
+                │                             │                     │
+   |0> ─────────┤──────────────────────────────●───── Ry(θ[1]) ────┤───
+                │                                                    │
+                └──────────────────────────────────────────────────┘
+                                      │
+                                      ▼
+                          parameters: [θ[0], θ[1], θ[2]]
+                          values:     [0.54, 1.23, -0.87]
+```
+
+**Data model**:
+
+```rust
+/// A symbolic parameter in a quantum circuit.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct Parameter {
+    pub name: String,
+    pub index: usize,
+}
+
+/// A gate that may reference symbolic parameters.
+pub enum ParameterizedGate {
+    /// Fixed gate (no parameters)
+    Fixed(Gate),
+    /// Rotation gate with a symbolic angle
+    Rx(ParameterExpr),
+    Ry(ParameterExpr),
+    Rz(ParameterExpr),
+    /// Parameterized two-qubit gate
+    Rzz(ParameterExpr, Qubit, Qubit),
+}
+
+/// Expression for a gate parameter (supports linear combinations).
+pub enum ParameterExpr {
+    /// Direct parameter reference: θ[i]
+    Param(usize),
+    /// Scaled parameter: c * θ[i]
+    Scaled(f64, usize),
+    /// Sum of expressions
+    Sum(Box<ParameterExpr>, Box<ParameterExpr>),
+    /// Constant value
+    Constant(f64),
+}
+```
+
+**Resolution**: When `evaluate(params: &[f64])` is called, each `ParameterExpr` is resolved
+to a concrete `f64`, and the corresponding unitary matrix is computed. This happens once per
+VQE iteration and is negligible compared to state vector manipulation.
+
+### 2. Hamiltonian Representation
+
+The Hamiltonian is represented as a sum of weighted Pauli strings:
+
+```
+H = c_0 * I + c_1 * Z_0 + c_2 * Z_1 + c_3 * Z_0 Z_1 + c_4 * X_0 X_1 + ...
+```
+
+where each term is a tensor product of single-qubit Pauli operators {I, X, Y, Z}.
+
+```rust
+/// A single Pauli operator on one qubit.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Pauli {
+    I,
+    X,
+    Y,
+    Z,
+}
+
+/// A Pauli string: tensor product of single-qubit Paulis.
+/// Stored as a compact bitfield for n-qubit systems.
+///
+/// Encoding: 2 bits per qubit (00=I, 01=X, 10=Y, 11=Z)
+/// For n <= 32 qubits, fits in a single u64.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct PauliString {
+    /// Packed Pauli operators (2 bits each)
+    pub ops: Vec<u64>,
+    /// Number of qubits
+    pub n_qubits: usize,
+}
+
+/// A Hamiltonian as a sum of weighted Pauli strings.
+///
+/// H = sum_j c_j P_j
+pub struct PauliSum {
+    /// Terms: (coefficient, Pauli string)
+    pub terms: Vec<(Complex64, PauliString)>,
+    /// Number of qubits
+    pub n_qubits: usize,
+}
+```
+
+**Optimization**: Identity terms (all-I Pauli strings) contribute a constant energy offset
+and require no state vector computation. The implementation detects and separates these
+before the expectation loop.
+
+### 3. Direct Expectation Value Computation
+
+This is the critical performance advantage of state vector simulation over real hardware.
+On physical quantum computers, expectation values must be estimated via repeated
+measurement (shot-based sampling), requiring O(1/epsilon^2) shots for epsilon precision.
+
+In a state vector simulator, we compute the **exact** expectation value:
+
+```
+<psi| H |psi> = sum_j c_j * <psi| P_j |psi>
+```
+
+For each Pauli string P_j, the expectation value is:
+
+```
+<psi| P_j |psi> = sum_k psi_k* (P_j |psi>)_k
+```
+
+Since P_j is a tensor product of single-qubit Paulis, its action on a basis state |k> is:
+- I: |k> -> |k>
+- X: flips qubit, no phase
+- Y: flips qubit, phase factor +/- i
+- Z: no flip, phase factor +/- 1
+
+This means each Pauli string maps each basis state to exactly one other basis state with
+a phase factor. The expectation value reduces to a sum over 2^n amplitudes.
+
+```rust
+impl QuantumState {
+    /// Compute the exact expectation value of a PauliSum.
+    ///
+    /// Complexity: O(T * 2^n) where T = number of Pauli terms, n = qubits.
+    /// For a 12-qubit system with 100 Pauli terms:
+    ///   100 * 4096 = 409,600 operations ~ 0.5ms
+    pub fn expectation(&self, hamiltonian: &PauliSum) -> f64 {
+        let mut total = 0.0_f64;
+
+        for (coeff, pauli) in &hamiltonian.terms {
+            let mut term_val = Complex64::zero();
+
+            for k in 0..self.amplitudes.len() {
+                // Compute P_j |k>: determine target index and phase
+                let (target_idx, phase) = pauli.apply_to_basis(k);
+                // <k| P_j |psi> = phase * psi[target_idx]
+                // Accumulate psi[k]* * phase * psi[target_idx]
+                term_val += self.amplitudes[k].conj()
+                    * phase
+                    * self.amplitudes[target_idx];
+            }
+
+            total += (coeff * term_val).re;
+        }
+
+        total
+    }
+}
+```
+
+**Function signature**: `QuantumState::expectation(PauliSum) -> f64`
+
+#### Accuracy Advantage Over Sampling
+
+| Method | Precision | Evaluations | 12-qubit Cost |
+|--------|-----------|-------------|---------------|
+| Shot-based (1000 shots) | ~3% | 1000 circuit runs per term | ~500ms |
+| Shot-based (10000 shots) | ~1% | 10000 circuit runs per term | ~5s |
+| Shot-based (1M shots) | ~0.1% | 1M circuit runs per term | ~500s |
+| **Exact (state vector)** | **Machine epsilon** | **1 pass over state** | **~0.5ms** |
+
+For VQE convergence, exact expectation values eliminate the statistical noise floor that
+plagues hardware-based VQE. Classical optimizers receive clean gradients, leading to:
+- Faster convergence (fewer iterations)
+- No barren plateau artifacts from shot noise
+- Deterministic reproducibility
+
+### 4. Gradient Support via Parameter-Shift Rule
+
+The parameter-shift rule provides exact analytic gradients for parameterized quantum gates.
+For a gate with parameter theta:
+
+```
+d/d(theta) <H> = [<H>(theta + pi/2) - <H>(theta - pi/2)] / 2
+```
+
+This requires two circuit evaluations per parameter per gradient component.
+
+```rust
+/// Compute the gradient of the expectation value with respect to all parameters.
+///
+/// Uses the parameter-shift rule:
+///   grad_i = [E(theta_i + pi/2) - E(theta_i - pi/2)] / 2
+///
+/// Complexity: O(2 * n_params * circuit_eval_cost)
+/// For 12 qubits, 20 parameters, 100 Pauli terms:
+///   2 * 20 * (circuit_sim + expectation) ~ 40 * 1ms = 40ms
+pub fn gradient(
+    circuit: &ParameterizedCircuit,
+    hamiltonian: &PauliSum,
+    params: &[f64],
+) -> Vec<f64> {
+    let n_params = params.len();
+    let mut grad = vec![0.0; n_params];
+    let shift = std::f64::consts::FRAC_PI_2; // pi/2
+
+    for i in 0..n_params {
+        // Forward shift
+        let mut params_plus = params.to_vec();
+        params_plus[i] += shift;
+        let e_plus = evaluate_energy(circuit, hamiltonian, &params_plus);
+
+        // Backward shift
+        let mut params_minus = params.to_vec();
+        params_minus[i] -= shift;
+        let e_minus = evaluate_energy(circuit, hamiltonian, &params_minus);
+
+        grad[i] = (e_plus - e_minus) / 2.0;
+    }
+
+    grad
+}
+```
+
+### 5. Classical Optimizer Interface
+
+A trait-based abstraction supports plugging in different classical optimizers without
+changing the VQE loop:
+
+```rust
+/// Trait for classical optimizers used in the VQE outer loop.
+pub trait ClassicalOptimizer: Send {
+    /// Initialize the optimizer with the parameter count.
+    fn initialize(&mut self, n_params: usize);
+
+    /// Propose next parameter values given current energy and optional gradient.
+    fn step(
+        &mut self,
+        params: &[f64],
+        energy: f64,
+        gradient: Option<&[f64]>,
+    ) -> OptimizerResult;
+
+    /// Check if the optimizer has converged.
+    fn has_converged(&self) -> bool;
+
+    /// Get optimizer name for logging.
+    fn name(&self) -> &str;
+}
+
+/// Result of an optimizer step.
+pub struct OptimizerResult {
+    pub new_params: Vec<f64>,
+    pub converged: bool,
+    pub iteration: usize,
+}
+```
+
+**Provided implementations**:
+
+| Optimizer | Type | Gradient Required | Best For |
+|-----------|------|-------------------|----------|
+| `GradientDescent` | Gradient-based | Yes | Simple landscapes |
+| `Adam` | Adaptive gradient | Yes | Noisy gradients, deep circuits |
+| `LBFGS` | Quasi-Newton | Yes | Smooth landscapes, fast convergence |
+| `COBYLA` | Derivative-free | No | Non-differentiable cost functions |
+| `NelderMead` | Simplex | No | Low-dimensional problems |
+| `SPSA` | Stochastic | No | Shot-based mode, noisy evaluations |
+
+### 6. VQE Iteration Loop
+
+The complete VQE algorithm proceeds as follows:
+
+```
+VQE Iteration Loop
+==================
+
+Input:  Hamiltonian H (PauliSum), Ansatz A (ParameterizedCircuit),
+        Optimizer O (ClassicalOptimizer), initial params theta_0
+
+Output: Minimum energy E_min, optimal params theta_opt
+
+    theta = theta_0
+    O.initialize(len(theta))
+
+    repeat:
+        ┌─────────────────────────────────────────────┐
+        │  1. PREPARE STATE                            │
+        │     |psi(theta)> = A(theta) |0...0>          │
+        │     [Simulate parameterized circuit]          │
+        │     Cost: O(G * 2^n) where G = gate count    │
+        └─────────────────────────────────────────────┘
+                           │
+                           ▼
+        ┌─────────────────────────────────────────────┐
+        │  2. EVALUATE ENERGY                          │
+        │     E = <psi(theta)| H |psi(theta)>          │
+        │     [Direct state vector expectation]         │
+        │     Cost: O(T * 2^n) where T = Pauli terms   │
+        └─────────────────────────────────────────────┘
+                           │
+                           ▼
+        ┌─────────────────────────────────────────────┐
+        │  3. COMPUTE GRADIENT (if optimizer needs it) │
+        │     grad = parameter_shift(A, H, theta)      │
+        │     [2 * n_params circuit evaluations]        │
+        │     Cost: O(2P * (G + T) * 2^n)              │
+        └─────────────────────────────────────────────┘
+                           │
+                           ▼
+        ┌─────────────────────────────────────────────┐
+        │  4. CLASSICAL UPDATE                         │
+        │     theta_new = O.step(theta, E, grad)       │
+        │     [Pure classical computation]              │
+        │     Cost: O(P^2) for quasi-Newton             │
+        └─────────────────────────────────────────────┘
+                           │
+                           ▼
+        ┌─────────────────────────────────────────────┐
+        │  5. CONVERGENCE CHECK                        │
+        │     if |E_new - E_old| < tol: STOP           │
+        │     else: theta = theta_new, continue         │
+        └─────────────────────────────────────────────┘
+
+    return (E_min, theta_opt)
+```
+
+**Pseudocode**:
+
+```rust
+pub fn vqe(
+    ansatz: &ParameterizedCircuit,
+    hamiltonian: &PauliSum,
+    optimizer: &mut dyn ClassicalOptimizer,
+    config: &VqeConfig,
+) -> VqeResult {
+    let n_params = ansatz.parameter_count();
+    let mut params = config.initial_params.clone()
+        .unwrap_or_else(|| vec![0.0; n_params]);
+
+    optimizer.initialize(n_params);
+
+    let mut best_energy = f64::INFINITY;
+    let mut best_params = params.clone();
+    let mut history = Vec::new();
+
+    for iteration in 0..config.max_iterations {
+        // Step 1+2: Simulate circuit and compute energy
+        let state = ansatz.simulate(&params);
+        let energy = state.expectation(hamiltonian);
+
+        // Track best
+        if energy < best_energy {
+            best_energy = energy;
+            best_params = params.clone();
+        }
+
+        // Step 3: Compute gradient if needed
+        let grad = if optimizer.needs_gradient() {
+            Some(gradient(ansatz, hamiltonian, &params))
+        } else {
+            None
+        };
+
+        history.push(VqeIteration { iteration, energy, params: params.clone() });
+
+        // Step 4: Classical update
+        let result = optimizer.step(&params, energy, grad.as_deref());
+        params = result.new_params;
+
+        // Step 5: Convergence check
+        if result.converged || (iteration > 0 &&
+            (history[iteration].energy - history[iteration - 1].energy).abs()
+                < config.convergence_threshold) {
+            break;
+        }
+    }
+
+    VqeResult {
+        energy: best_energy,
+        optimal_params: best_params,
+        iterations: history.len(),
+        history,
+        converged: optimizer.has_converged(),
+    }
+}
+```
+
+### 7. Optional Shot-Based Sampling Mode
+
+For mimicking real hardware behavior and testing noise resilience:
+
+```rust
+/// Configuration for shot-based VQE mode.
+pub struct ShotConfig {
+    /// Number of measurement shots per expectation estimation
+    pub shots: usize,
+    /// Random seed for reproducibility
+    pub seed: Option<u64>,
+    /// Readout error rate (probability of bit flip on measurement)
+    pub readout_error: f64,
+}
+
+impl QuantumState {
+    /// Estimate expectation value via shot-based sampling.
+    ///
+    /// Samples the state `shots` times in the computational basis,
+    /// then computes the empirical expectation of each Pauli term.
+    pub fn expectation_sampled(
+        &self,
+        hamiltonian: &PauliSum,
+        config: &ShotConfig,
+    ) -> (f64, f64) {
+        // Returns (mean, standard_error)
+        // Standard error = std_dev / sqrt(shots)
+        todo!()
+    }
+}
+```
+
+### 8. Hardware-Efficient Ansatz Patterns
+
+Pre-built ansatz constructors for common use cases:
+
+```
+Hardware-Efficient Ansatz (depth d, n qubits):
+
+Layer 1..d:
+  ┌─────┐  ┌──────────┐  ┌──────────┐  ┌──────────┐
+  ┤ Ry  ├──┤  Rz      ├──┤  CNOT    ├──┤  Ry      ├──
+  └─────┘  └──────────┘  │  ladder  │  └──────────┘
+  ┌─────┐  ┌──────────┐  │          │  ┌──────────┐
+  ┤ Ry  ├──┤  Rz      ├──┤          ├──┤  Ry      ├──
+  └─────┘  └──────────┘  └──────────┘  └──────────┘
+
+Parameters per layer: 3n (Ry + Rz + Ry per qubit)
+Total parameters:     3nd
+```
+
+```rust
+/// Pre-built ansatz constructors.
+pub mod ansatz {
+    /// Hardware-efficient ansatz with Ry-Rz layers and linear CNOT entanglement.
+    pub fn hardware_efficient(n_qubits: usize, depth: usize) -> ParameterizedCircuit;
+
+    /// UCCSD (Unitary Coupled Cluster Singles and Doubles) for chemistry.
+    /// Generates excitation operators based on active space.
+    pub fn uccsd(n_electrons: usize, n_orbitals: usize) -> ParameterizedCircuit;
+
+    /// Hamiltonian variational ansatz: layers of exp(-i * theta_j * P_j)
+    /// for each term P_j in the Hamiltonian.
+    pub fn hamiltonian_variational(
+        hamiltonian: &PauliSum,
+        depth: usize,
+    ) -> ParameterizedCircuit;
+
+    /// Symmetry-preserving ansatz that respects particle number conservation.
+    pub fn symmetry_preserving(
+        n_qubits: usize,
+        n_particles: usize,
+        depth: usize,
+    ) -> ParameterizedCircuit;
+}
+```
+
+### 9. Performance Analysis
+
+#### 12-Qubit VQE Performance Estimate
+
+| Component | Operations | Time |
+|-----------|-----------|------|
+| State vector size | 2^12 = 4,096 complex amplitudes | 64 KB |
+| Circuit simulation (50 gates) | 50 * 4096 = 204,800 ops | ~0.3ms |
+| Expectation (100 Pauli terms) | 100 * 4096 = 409,600 ops | ~0.5ms |
+| Gradient (20 params) | 40 * (0.3 + 0.5) ms | ~32ms |
+| Classical optimizer step | O(20^2) | ~0.001ms |
+| **Total per iteration (with gradient)** | | **~33ms** |
+| **Total per iteration (no gradient)** | | **~0.8ms** |
+
+For gradient-free optimizers (COBYLA, Nelder-Mead), a 12-qubit VQE iteration completes
+in under 1ms. With parameter-shift gradients, the cost scales linearly with parameter
+count but remains under 50ms for typical chemistry ansatze.
+
+**Scaling with qubit count**:
+
+| Qubits | State Size | Memory | Energy Eval (100 terms) | Gradient (20 params) |
+|--------|-----------|--------|------------------------|---------------------|
+| 8 | 256 | 4 KB | ~0.03ms | ~2ms |
+| 12 | 4,096 | 64 KB | ~0.5ms | ~33ms |
+| 16 | 65,536 | 1 MB | ~8ms | ~500ms |
+| 20 | 1,048,576 | 16 MB | ~130ms | ~8s |
+| 24 | 16,777,216 | 256 MB | ~2s | ~130s |
+| 28 | 268,435,456 | 4 GB | ~33s | ~35min |
+
+### 10. Integration with ruVector Agent System
+
+ruVector agents can drive autonomous chemistry optimization using VQE as the evaluation
+kernel:
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                  ruVector Agent Orchestration                     │
+│                                                                   │
+│  ┌──────────┐    ┌──────────────┐    ┌────────────────────┐     │
+│  │ Research  │───>│ Architecture │───>│  Chemistry Agent   │     │
+│  │  Agent    │    │    Agent     │    │                    │     │
+│  │           │    │              │    │  - Molecule spec   │     │
+│  │ Literature│    │ Hamiltonian  │    │  - Basis set sel.  │     │
+│  │ search    │    │ generation   │    │  - Active space    │     │
+│  └──────────┘    └──────────────┘    │  - VQE execution   │     │
+│                                       │  - Result analysis │     │
+│                                       └────────┬───────────┘     │
+│                                                │                  │
+│                                       ┌────────▼───────────┐     │
+│                                       │   ruQu VQE Engine  │     │
+│                                       │                    │     │
+│                                       │  Parameterized     │     │
+│                                       │  Circuit + PauliSum│     │
+│                                       │  + Optimizer        │     │
+│                                       └────────────────────┘     │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+The agent workflow:
+1. **Research agent** retrieves molecular structure and prior computational results
+2. **Architecture agent** generates the qubit Hamiltonian (Jordan-Wigner or Bravyi-Kitaev
+   transformation from fermionic operators)
+3. **Chemistry agent** selects ansatz, optimizer, and runs VQE iterations
+4. **Results** are stored in ruVector memory for pattern learning across molecules
+
+---
+
+## Consequences
+
+### Benefits
+
+1. **Exact expectation values** eliminate sampling noise, enabling faster convergence and
+   deterministic reproducibility -- a major advantage over hardware VQE
+2. **Symbolic parameterization** avoids circuit reconstruction overhead, reducing per-iteration
+   cost to pure state manipulation
+3. **Trait-based optimizer interface** allows users to swap optimizers without touching VQE
+   logic, and supports custom optimizer implementations
+4. **Hardware-efficient ansatz library** provides tested, production-quality circuit templates
+   for common use cases
+5. **Gradient support** via parameter-shift rule enables modern gradient-based optimization
+   (Adam, L-BFGS) that converges significantly faster than derivative-free methods
+6. **Agent integration** enables autonomous, memory-enhanced chemistry exploration that
+   learns from prior VQE runs across molecular configurations
+
+### Risks
+
+| Risk | Probability | Impact | Mitigation |
+|------|------------|--------|------------|
+| Exponential memory scaling limits qubit count | High | Medium | Tensor network backend for >30 qubits (future ADR) |
+| Parameter-shift gradient cost scales with parameter count | Medium | Medium | Batched gradient evaluation, simultaneous perturbation (SPSA) fallback |
+| Hamiltonian term count explosion for large molecules | Medium | High | Pauli grouping (qubit-wise commuting), measurement reduction techniques |
+| Optimizer convergence to local minima | Medium | Medium | Multi-start strategies, QAOA-inspired initialization |
+
+### Trade-offs
+
+| Decision | Advantage | Disadvantage |
+|----------|-----------|--------------|
+| Exact expectation over sampling | Machine-precision accuracy | Not representative of real hardware noise |
+| Parameter-shift over finite-difference | Exact gradients | 2x evaluations per parameter |
+| Trait-based optimizer | Extensible | Slight abstraction overhead |
+| Compact PauliString bitfield | Cache-friendly | Complex bit manipulation logic |
+
+---
+
+## References
+
+- Peruzzo, A. et al. "A variational eigenvalue solver on a photonic quantum processor." Nature Communications 5, 4213 (2014)
+- McClean, J.R. et al. "The theory of variational hybrid quantum-classical algorithms." New Journal of Physics 18, 023023 (2016)
+- Kandala, A. et al. "Hardware-efficient variational quantum eigensolver for small molecules." Nature 549, 242-246 (2017)
+- Schuld, M. et al. "Evaluating analytic gradients on quantum hardware." Physical Review A 99, 032331 (2019)
+- ADR-001: ruQu Architecture - Classical Nervous System for Quantum Machines
+- ADR-QE-001 through ADR-QE-004: Prior quantum engine architecture decisions
+- ruQu crate: `crates/ruQu/src/` - existing syndrome processing and coherence gate infrastructure
+- ruVector memory system: pattern storage for cross-molecule VQE learning
diff --git a/docs/adr/quantum-engine/ADR-QE-006-grover-search-implementation.md b/docs/adr/quantum-engine/ADR-QE-006-grover-search-implementation.md
new file mode 100644
index 00000000..d4953cc8
--- /dev/null
+++ b/docs/adr/quantum-engine/ADR-QE-006-grover-search-implementation.md
@@ -0,0 +1,562 @@
+# ADR-QE-006: Grover's Search Algorithm Implementation
+
+**Status**: Proposed
+**Date**: 2026-02-06
+**Authors**: ruv.io, RuVector Team
+**Deciders**: Architecture Review Board
+
+## Version History
+
+| Version | Date | Author | Changes |
+|---------|------|--------|---------|
+| 0.1 | 2026-02-06 | ruv.io | Initial Grover's search architecture proposal |
+
+---
+
+## Context
+
+### Unstructured Search and Quadratic Speedup
+
+Grover's algorithm is one of the foundational quantum algorithms, providing a provable
+quadratic speedup for unstructured search. Given a search space of N = 2^n items and an
+oracle that marks one or more target items, Grover's algorithm finds a target in
+O(sqrt(N)) oracle queries, compared to the classical O(N) lower bound.
+
+### Building Blocks
+
+The algorithm consists of two principal components applied repeatedly:
+
+1. **Oracle (O)**: Flips the phase of marked (target) states
+   - On hardware: requires multi-controlled-Z decomposition into elementary gates
+   - In simulation: can be a single O(1) amplitude flip (key insight)
+
+2. **Diffuser (D)**: Inversion about the mean amplitude (also called the Grover diffusion
+   operator)
+   - D = 2|s><s| - I, where |s> is the uniform superposition
+   - Implemented as: H^{otimes n} * (2|0><0| - I) * H^{otimes n}
+
+### Why Simulation Unlocks a Unique Optimization
+
+On real quantum hardware, the oracle must be decomposed into a circuit of elementary
+gates. For a single marked state in n qubits, the oracle requires O(n) multi-controlled
+gates, each of which may need further decomposition. The full gate count is O(n^2) or
+worse depending on connectivity.
+
+In a state vector simulator, we have **direct access to the amplitude array**. The oracle
+for a known marked state at index t is simply:
+
+```
+amplitudes[t] *= -1
+```
+
+This is an O(1) operation, regardless of qubit count. This fundamentally changes the
+performance profile of Grover simulation.
+
+### Applications in ruVector
+
+| Application | Description |
+|-------------|-------------|
+| Vector DB search | Encode HNSW candidate filtering as a Grover oracle |
+| SAT solving | Map boolean satisfiability to oracle function |
+| Cryptographic analysis | Brute-force key search with quadratic speedup |
+| Database queries | Unstructured search over ruVector memory entries |
+| Algorithm benchmarking | Reference implementation for quantum advantage studies |
+
+---
+
+## Decision
+
+### 1. Oracle Implementation Strategy
+
+We provide two oracle modes: optimized index-based for known targets, and general
+unitary oracle for black-box functions.
+
+#### Mode A: Index-Based Oracle (O(1) per application)
+
+When the target index is known (or the oracle can be expressed as a predicate on
+basis state indices), we bypass gate decomposition entirely:
+
+```rust
+impl QuantumState {
+    /// Apply Grover oracle by direct amplitude negation.
+    ///
+    /// Flips the sign of amplitude at the given index.
+    /// This is an O(1) operation -- the key simulation advantage.
+    ///
+    /// On hardware, this would require O(n) multi-controlled gates
+    /// decomposed into O(n^2) elementary gates.
+    #[inline]
+    pub fn oracle_flip(&mut self, target_index: usize) {
+        debug_assert!(target_index < self.amplitudes.len());
+        self.amplitudes[target_index] = -self.amplitudes[target_index];
+    }
+
+    /// Apply Grover oracle for multiple marked states.
+    ///
+    /// Complexity: O(k) where k = number of marked states.
+    /// Hardware equivalent: O(k * n^2) gates.
+    pub fn oracle_flip_multi(&mut self, target_indices: &[usize]) {
+        for &idx in target_indices {
+            debug_assert!(idx < self.amplitudes.len());
+            self.amplitudes[idx] = -self.amplitudes[idx];
+        }
+    }
+}
+```
+
+**Why this is valid**: The oracle operator O is defined as the diagonal unitary
+O = I - 2|t><t|, which maps |t> to -|t> and leaves all other basis states unchanged.
+In the amplitude array, this is exactly `amplitudes[t] *= -1`. No physical gate
+decomposition is needed because we are simulating the mathematical operator directly.
+
+#### Mode B: General Unitary Oracle
+
+For black-box oracle functions where the marked states are not known in advance:
+
+```rust
+/// A general oracle as a unitary operation on the state vector.
+///
+/// The oracle function receives a basis state index and returns
+/// true if it should be marked (phase-flipped).
+pub trait GroverOracle: Send {
+    /// Evaluate whether basis state |index> is a target.
+    fn is_marked(&self, index: usize, n_qubits: usize) -> bool;
+}
+
+impl QuantumState {
+    /// Apply a general Grover oracle.
+    ///
+    /// Iterates over all 2^n amplitudes, evaluating the oracle predicate.
+    /// Complexity: O(2^n) per application (equivalent to hardware cost).
+    pub fn oracle_apply(&mut self, oracle: &dyn GroverOracle) {
+        let n_qubits = self.n_qubits;
+        for i in 0..self.amplitudes.len() {
+            if oracle.is_marked(i, n_qubits) {
+                self.amplitudes[i] = -self.amplitudes[i];
+            }
+        }
+    }
+}
+```
+
+### 2. Diffuser Implementation
+
+The Grover diffuser (inversion about the mean) is decomposed as:
+
+```
+D = H^{otimes n} * phase_flip(|0>) * H^{otimes n}
+```
+
+where `phase_flip(|0>)` flips the sign of the all-zeros state: (2|0><0| - I).
+
+```
+Diffuser Circuit Decomposition:
+
+|psi> ──[H]──[phase_flip(0)]──[H]──
+
+Expanded:
+
+         ┌───┐   ┌──────────────┐   ┌───┐
+  q[0] ──┤ H ├───┤              ├───┤ H ├──
+         └───┘   │              │   └───┘
+         ┌───┐   │  2|0><0| - I │   ┌───┐
+  q[1] ──┤ H ├───┤              ├───┤ H ├──
+         └───┘   │              │   └───┘
+         ┌───┐   │              │   ┌───┐
+  q[2] ──┤ H ├───┤              ├───┤ H ├──
+         └───┘   └──────────────┘   └───┘
+```
+
+Both the H^{otimes n} layers and the phase_flip(0) benefit from simulation optimizations:
+
+```rust
+impl QuantumState {
+    /// Apply Hadamard to all qubits.
+    ///
+    /// Optimized implementation using butterfly structure.
+    /// Complexity: O(n * 2^n)
+    pub fn hadamard_all(&mut self) {
+        for qubit in 0..self.n_qubits {
+            self.apply_hadamard(qubit);
+        }
+    }
+
+    /// Flip the phase of the |0...0> state.
+    ///
+    /// O(1) operation via direct indexing -- another simulation advantage.
+    /// On hardware, this requires an n-controlled-Z gate.
+    #[inline]
+    pub fn phase_flip_zero(&mut self) {
+        // |0...0> is at index 0
+        self.amplitudes[0] = -self.amplitudes[0];
+    }
+
+    /// Apply the full Grover diffuser.
+    ///
+    /// D = H^n * (2|0><0| - I) * H^n
+    ///
+    /// Implementation note: (2|0><0| - I) negates all states except |0>,
+    /// which is equivalent to a global phase of -1 followed by
+    /// flipping amplitude[0]. We use the phase_flip_zero + global negate
+    /// approach for efficiency.
+    pub fn grover_diffuser(&mut self) {
+        self.hadamard_all();
+
+        // Apply 2|0><0| - I:
+        // Negate all amplitudes, then flip sign of |0> again
+        // This gives: amp[0] -> amp[0], amp[k] -> -amp[k] for k != 0
+        for amp in self.amplitudes.iter_mut() {
+            *amp = -*amp;
+        }
+        self.amplitudes[0] = -self.amplitudes[0];
+
+        self.hadamard_all();
+    }
+}
+```
+
+### 3. Optimal Iteration Count
+
+The optimal number of Grover iterations for k marked states out of N = 2^n total:
+
+```
+iterations = floor(pi/4 * sqrt(N/k))
+```
+
+For a single marked state (k=1):
+
+| Qubits (n) | N = 2^n | Optimal Iterations | Classical Steps |
+|------------|---------|-------------------|----------------|
+| 4 | 16 | 3 | 16 |
+| 8 | 256 | 12 | 256 |
+| 12 | 4,096 | 50 | 4,096 |
+| 16 | 65,536 | 201 | 65,536 |
+| 20 | 1,048,576 | 804 | 1,048,576 |
+
+```rust
+/// Compute the optimal number of Grover iterations.
+///
+/// For k marked states in a search space of 2^n:
+///   iterations = floor(pi/4 * sqrt(2^n / k))
+pub fn optimal_iterations(n_qubits: usize, n_marked: usize) -> usize {
+    let n = (1_usize << n_qubits) as f64;
+    let k = n_marked as f64;
+    (std::f64::consts::FRAC_PI_4 * (n / k).sqrt()).floor() as usize
+}
+```
+
+### 4. Complete Grover Algorithm
+
+```rust
+/// Configuration for Grover's search.
+pub struct GroverConfig {
+    /// Number of qubits
+    pub n_qubits: usize,
+    /// Target indices (for index-based oracle)
+    pub targets: Vec<usize>,
+    /// Custom oracle (overrides targets if set)
+    pub oracle: Option<Box<dyn GroverOracle>>,
+    /// Override iteration count (auto-computed if None)
+    pub iterations: Option<usize>,
+    /// Number of measurement shots (for probabilistic result)
+    pub shots: usize,
+}
+
+/// Result of Grover's search.
+pub struct GroverResult {
+    /// Most likely measurement outcome (basis state index)
+    pub found_index: usize,
+    /// Probability of measuring the found state
+    pub success_probability: f64,
+    /// Number of Grover iterations performed
+    pub iterations: usize,
+    /// Total wall-clock time
+    pub elapsed: Duration,
+    /// Full probability distribution (optional, for analysis)
+    pub probabilities: Option<Vec<f64>>,
+}
+```
+
+**Pseudocode for the complete algorithm**:
+
+```rust
+pub fn grover_search(config: &GroverConfig) -> GroverResult {
+    let n = config.n_qubits;
+    let num_states = 1 << n;
+
+    // Step 1: Initialize uniform superposition
+    //         |s> = H^n |0...0> = (1/sqrt(N)) * sum_k |k>
+    let mut state = QuantumState::new(n);
+    state.hadamard_all();  // O(n * 2^n)
+
+    // Step 2: Determine iteration count
+    let k = config.targets.len();
+    let iterations = config.iterations
+        .unwrap_or_else(|| optimal_iterations(n, k));
+
+    // Step 3: Apply Grover iterations
+    for _iter in 0..iterations {
+        // Oracle: flip phase of marked states
+        match &config.oracle {
+            Some(oracle) => state.oracle_apply(oracle.as_ref()),
+            None => state.oracle_flip_multi(&config.targets),
+        }
+
+        // Diffuser: inversion about the mean
+        state.grover_diffuser();
+    }
+
+    // Step 4: Measure (find highest-probability state)
+    let probabilities: Vec<f64> = state.amplitudes.iter()
+        .map(|a| a.norm_sqr())
+        .collect();
+
+    let found_index = probabilities.iter()
+        .enumerate()
+        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
+        .map(|(i, _)| i)
+        .unwrap();
+
+    GroverResult {
+        found_index,
+        success_probability: probabilities[found_index],
+        iterations,
+        elapsed: start.elapsed(),
+        probabilities: Some(probabilities),
+    }
+}
+```
+
+### 5. The O(1) Oracle Trick: Simulation-Unique Advantage
+
+This section formalizes the performance advantage unique to state vector simulation.
+
+**Hardware cost model** (per Grover iteration):
+
+```
+Oracle (hardware):
+  - Multi-controlled-Z gate: O(n) Toffoli gates
+  - Each Toffoli: ~6 CNOT + single-qubit gates
+  - Total: O(n) gates, each touching O(2^n) amplitudes in simulation
+  - Simulation cost: O(n * 2^n) per oracle application
+
+Diffuser (hardware):
+  - H^n: n Hadamard gates = O(n * 2^n) simulation ops
+  - Multi-controlled-Z: same as oracle = O(n * 2^n) simulation ops
+  - H^n: O(n * 2^n) again
+  - Total: O(n * 2^n) per diffuser
+
+Per iteration (hardware path): O(n * 2^n)
+Total (hardware path): O(n * 2^n * sqrt(2^n)) = O(n * 2^(3n/2))
+```
+
+**Simulation cost model** (with O(1) oracle optimization):
+
+```
+Oracle (optimized):
+  - Direct amplitude flip: O(1) for single target, O(k) for k targets
+  - Simulation cost: O(k)
+
+Diffuser (optimized):
+  - H^n: O(n * 2^n) -- unavoidable
+  - phase_flip(0): O(1) via direct index
+  - H^n: O(n * 2^n)
+  - Total: O(n * 2^n) per diffuser
+
+Per iteration (optimized): O(n * 2^n)  [dominated by diffuser]
+Total (optimized): O(n * 2^n * sqrt(2^n)) = O(n * 2^(3n/2))
+```
+
+The asymptotic complexity is the same (diffuser dominates), but the constant factor
+improvement is significant: the oracle step drops from O(n * 2^n) to O(k), saving
+roughly 50% of per-iteration time for single-target search.
+
+### 6. Multi-Target Grover Support
+
+When multiple states are marked (k > 1), the algorithm converges faster:
+
+```
+iterations(k) = floor(pi/4 * sqrt(N/k))
+```
+
+The success probability oscillates sinusoidally. For k targets:
+
+```
+P(success after t iterations) = sin^2((2t+1) * arcsin(sqrt(k/N)))
+```
+
+```rust
+/// Compute success probability after t Grover iterations.
+pub fn success_probability(n_qubits: usize, n_marked: usize, iterations: usize) -> f64 {
+    let n = (1_usize << n_qubits) as f64;
+    let k = n_marked as f64;
+    let theta = (k / n).sqrt().asin();
+    let angle = (2.0 * iterations as f64 + 1.0) * theta;
+    angle.sin().powi(2)
+}
+```
+
+**Over-iteration risk**: If too many iterations are applied, the algorithm starts
+"uncomputing" the answer. The success probability oscillates with period
+~pi * sqrt(N/k) / 2. Our implementation auto-computes the optimal count and warns
+if the user-specified count deviates significantly.
+
+### 7. Performance Benchmarks
+
+#### Measured Performance Estimates
+
+| Qubits | States | Iterations | Oracle Cost | Diffuser Cost | Total |
+|--------|--------|-----------|-------------|--------------|-------|
+| 4 | 16 | 3 | 3 * O(1) | 3 * O(64) | <0.01ms |
+| 8 | 256 | 12 | 12 * O(1) | 12 * O(2048) | <0.1ms |
+| 12 | 4,096 | 50 | 50 * O(1) | 50 * O(49K) | ~1ms |
+| 16 | 65,536 | 201 | 201 * O(1) | 201 * O(1M) | ~10ms |
+| 20 | 1,048,576 | 804 | 804 * O(1) | 804 * O(20M) | ~500ms |
+| 24 | 16,777,216 | 3,217 | 3217 * O(1) | 3217 * O(402M) | ~60s |
+
+**Gate-count equivalent** (for comparison with hardware gate-based simulation):
+
+| Qubits | Grover Iterations | Equivalent Gate Count | Index-Optimized Ops |
+|--------|------------------|----------------------|---------------------|
+| 8 | 12 | ~200 gates | ~25K ops |
+| 12 | 50 | ~1,500 gates | ~2.5M ops |
+| 16 | 201 | ~10,000 gates | ~200M ops |
+| 20 | 804 | ~60,000 gates | ~16B ops |
+
+The "gates" column counts oracle gates (decomposed) + diffuser gates. The "ops" column
+counts actual floating-point operations in the optimized simulation path. The ratio
+confirms that the O(1) oracle trick yields a roughly 2x constant-factor improvement
+for the overall search.
+
+### 8. Integration with HNSW Index for Hybrid Quantum-Classical Search
+
+A speculative but architecturally sound integration path connects Grover's search with
+ruVector's HNSW (Hierarchical Navigable Small World) index:
+
+```
+Hybrid Quantum-Classical Nearest-Neighbor Search
+=================================================
+
+Phase 1: Classical HNSW (coarse filtering)
+  - Navigate the HNSW graph to find candidate neighborhood
+  - Reduce search space from N to ~sqrt(N) candidates
+  - Time: O(log N)
+
+Phase 2: Grover's Search (fine filtering)
+  - Encode candidate set as Grover oracle
+  - Search for exact nearest neighbor among candidates
+  - Quadratic speedup over brute-force comparison
+  - Time: O(N^{1/4}) for sqrt(N) candidates
+
+Combined: O(log N + N^{1/4}) vs classical O(log N + sqrt(N))
+
+          ┌──────────────────────────────────────────────┐
+          │           HNSW Layer Navigation               │
+          │                                                │
+          │  Layer 3:  o ─────────── o ────── o           │
+          │            │                      │            │
+          │  Layer 2:  o ── o ────── o ── o ──o           │
+          │            │    │        │    │   │            │
+          │  Layer 1:  o─o──o──o──o──o─o──o──o─o          │
+          │            │ │  │  │  │  │ │  │  │ │          │
+          │  Layer 0:  o-o-oo-oo-oo-oo-o-oo-oo-o         │
+          │                    │                            │
+          │            ┌───────▼────────┐                  │
+          │            │ Candidate Pool │                  │
+          │            │  ~sqrt(N) items│                  │
+          │            └───────┬────────┘                  │
+          │                    │                            │
+          └────────────────────┼───────────────────────────┘
+                               │
+                    ┌──────────▼───────────┐
+                    │  Grover's Search     │
+                    │                      │
+                    │  Oracle: distance    │
+                    │  threshold on        │
+                    │  candidate indices   │
+                    │                      │
+                    │  O(N^{1/4}) queries  │
+                    └──────────────────────┘
+```
+
+This integration is facilitated by ruVector's existing HNSW implementation
+(150x-12,500x faster than baseline, per ruVector performance targets). The Grover
+oracle would encode a distance-threshold predicate: "is vector[i] within distance d
+of the query vector?"
+
+```rust
+/// Oracle that marks basis states corresponding to vectors
+/// within distance threshold of a query.
+pub struct HnswGroverOracle {
+    /// Candidate indices from HNSW coarse search
+    pub candidates: Vec<usize>,
+    /// Query vector
+    pub query: Vec<f32>,
+    /// Distance threshold
+    pub threshold: f32,
+    /// Pre-computed distances (for O(1) oracle evaluation)
+    pub distances: Vec<f32>,
+}
+
+impl GroverOracle for HnswGroverOracle {
+    fn is_marked(&self, index: usize, _n_qubits: usize) -> bool {
+        if index < self.distances.len() {
+            self.distances[index] <= self.threshold
+        } else {
+            false
+        }
+    }
+}
+```
+
+**Note**: This hybrid approach is currently theoretical for classical simulation.
+Its value lies in (a) algorithm prototyping for future quantum hardware, and
+(b) demonstrating integration patterns between quantum algorithms and classical
+data structures.
+
+---
+
+## Consequences
+
+### Benefits
+
+1. **O(1) oracle optimization** provides a 2x constant-factor speedup unique to state
+   vector simulation, making Grover's algorithm practical for up to 20+ qubits
+2. **Dual oracle modes** support both fast known-target search (index-based) and general
+   black-box function search (predicate-based)
+3. **Auto-computed iteration count** prevents over-iteration and ensures near-optimal
+   success probability
+4. **Multi-target support** handles the general case of k marked states with appropriate
+   iteration adjustment
+5. **HNSW integration path** provides a concrete vision for hybrid quantum-classical
+   search that leverages ruVector's existing vector database infrastructure
+
+### Risks
+
+| Risk | Probability | Impact | Mitigation |
+|------|------------|--------|------------|
+| Diffuser dominates runtime, limiting oracle optimization benefit | High | Low | Accept 2x improvement; focus on SIMD-optimized Hadamard |
+| Multi-target count unknown in practice | Medium | Medium | Quantum counting subroutine (future work) |
+| HNSW integration adds complexity with unclear practical advantage | Low | Low | Keep as optional module, prototype-only initially |
+| Over-iteration produces incorrect results | Low | High | Auto-compute + warning system + probability tracking |
+
+### Trade-offs
+
+| Decision | Advantage | Disadvantage |
+|----------|-----------|--------------|
+| O(1) index oracle | Massive speedup for known targets | Not applicable to true black-box search |
+| Auto iteration count | Prevents user error | Less flexible for advanced use cases |
+| General oracle trait | Supports arbitrary predicates | O(2^n) per application (no speedup over gates) |
+| Eager probability tracking | Enables convergence monitoring | Memory overhead for probability vector |
+
+---
+
+## References
+
+- Grover, L.K. "A fast quantum mechanical algorithm for database search." Proceedings of the 28th Annual ACM Symposium on Theory of Computing, 212-219 (1996)
+- Boyer, M., Brassard, G., Hoyer, P., Tapp, A. "Tight bounds on quantum searching." Fortschritte der Physik 46, 493-505 (1998)
+- Malviya, Y.K., Zapatero, R.A. "Quantum search algorithms for database search: A comprehensive review." arXiv:2311.01265 (2023)
+- ADR-001: ruQu Architecture - Classical Nervous System for Quantum Machines
+- ADR-QE-005: VQE Algorithm Support (parameterized circuits, expectation values)
+- ruVector HNSW implementation: 150x-12,500x faster pattern search (CLAUDE.md performance targets)
+- ruQu crate: `crates/ruQu/src/` - syndrome processing and state vector infrastructure
diff --git a/docs/adr/quantum-engine/ADR-QE-007-qaoa-maxcut-implementation.md b/docs/adr/quantum-engine/ADR-QE-007-qaoa-maxcut-implementation.md
new file mode 100644
index 00000000..305e0696
--- /dev/null
+++ b/docs/adr/quantum-engine/ADR-QE-007-qaoa-maxcut-implementation.md
@@ -0,0 +1,631 @@
+# ADR-QE-007: QAOA MaxCut Implementation
+
+**Status**: Proposed
+**Date**: 2026-02-06
+**Authors**: ruv.io, RuVector Team
+**Deciders**: Architecture Review Board
+
+## Version History
+
+| Version | Date | Author | Changes |
+|---------|------|--------|---------|
+| 0.1 | 2026-02-06 | ruv.io | Initial QAOA MaxCut architecture proposal |
+
+---
+
+## Context
+
+### Combinatorial Optimization on Quantum Computers
+
+The Quantum Approximate Optimization Algorithm (QAOA), introduced by Farhi, Goldstone,
+and Gutmann (2014), is a leading candidate for demonstrating quantum advantage on
+combinatorial optimization problems. QAOA constructs a parameterized quantum circuit that
+encodes the cost function of an optimization problem and uses classical outer-loop
+optimization to find parameters that maximize the expected cost.
+
+### MaxCut as the Canonical QAOA Problem
+
+MaxCut is the prototypical problem for QAOA: given a graph G = (V, E), partition the
+vertices into two sets S and S-complement to maximize the number of edges crossing the
+partition.
+
+```
+MaxCut Example (5 vertices, 6 edges):
+
+    0 ─── 1
+    │ \   │
+    │   \ │
+    3 ─── 2
+          │
+          4
+
+Optimal cut: S = {0, 2, 4}, S' = {1, 3}
+Cut value: 5 edges crossing (0-1, 0-3, 1-2, 2-3, 2-4)
+```
+
+The cost function is:
+
+```
+C(z) = sum_{(i,j) in E} (1 - z_i * z_j) / 2
+```
+
+where z_i in {+1, -1} encodes the partition assignment.
+
+### QAOA Circuit Structure
+
+A depth-p QAOA circuit alternates two types of layers:
+
+1. **Phase separation** (encodes the problem): For each edge (i,j), apply
+   exp(-i * gamma * Z_i Z_j / 2)
+2. **Mixing** (explores the solution space): For each qubit i, apply
+   exp(-i * beta * X_i) = Rx(2*beta)
+
+```
+QAOA Circuit (p layers):
+
+|+>  ──[Phase(gamma_1)]──[Mix(beta_1)]──[Phase(gamma_2)]──[Mix(beta_2)]── ... ──[Measure]
+                                                                                      │
+Parameters: gamma = [gamma_1, ..., gamma_p], beta = [beta_1, ..., beta_p]            │
+                                                                                      ▼
+                                                                              Classical
+                                                                              Optimizer
+```
+
+### Why QAOA Matters for ruQu
+
+| Motivation | Details |
+|------------|---------|
+| Optimization benchmarks | Standard workload for evaluating quantum simulator performance |
+| Graph problems | Natural integration with ruVector graph database (ruvector-graph) |
+| Variational algorithm | Shares infrastructure with VQE (ADR-QE-005): parameterized circuits, expectation values, classical optimizers |
+| Scalability study | QAOA depth and graph size provide tunable complexity for benchmarking |
+| Agent integration | ruVector agents can use QAOA to solve graph optimization tasks autonomously |
+
+---
+
+## Decision
+
+### 1. Phase Separation Operator: Native Rzz Gate
+
+The phase separation operator for MaxCut applies exp(-i * gamma * Z_i Z_j / 2) for
+each edge (i,j). We implement this as a native two-qubit operation via direct amplitude
+manipulation, avoiding CNOT decomposition.
+
+**Mathematical basis**:
+
+```
+exp(-i * theta * Z_i Z_j / 2) acts on computational basis states as:
+
+  |00> -> e^{-i*theta/2} |00>    (Z_i Z_j = +1)
+  |01> -> e^{+i*theta/2} |01>    (Z_i Z_j = -1)
+  |10> -> e^{+i*theta/2} |10>    (Z_i Z_j = -1)
+  |11> -> e^{-i*theta/2} |11>    (Z_i Z_j = +1)
+```
+
+In the state vector, for each amplitude at index k:
+- Extract bits i and j from k
+- Compute parity = bit_i XOR bit_j
+- Apply phase: `amp[k] *= exp(-i * theta * (-1)^parity / 2)`
+  - If parity = 0 (same bits): `amp[k] *= exp(-i * theta / 2)`
+  - If parity = 1 (different bits): `amp[k] *= exp(+i * theta / 2)`
+
+```rust
+impl QuantumState {
+    /// Apply Rzz(theta) = exp(-i * theta * Z_i Z_j / 2) via direct amplitude
+    /// manipulation.
+    ///
+    /// For each basis state |k>:
+    ///   - Compute parity of bits i and j in k
+    ///   - Apply phase e^{-i * theta * (-1)^parity / 2}
+    ///
+    /// Complexity: O(2^n) -- single pass over state vector.
+    /// Vectorizable: all amplitudes are independent (no swaps).
+    ///
+    /// Hardware equivalent: CNOT(i,j) + Rz(theta, j) + CNOT(i,j) = 3 gates.
+    pub fn rzz(&mut self, theta: f64, qubit_i: usize, qubit_j: usize) {
+        let phase_same = Complex64::from_polar(1.0, -theta / 2.0);
+        let phase_diff = Complex64::from_polar(1.0, theta / 2.0);
+
+        let mask_i = 1_usize << qubit_i;
+        let mask_j = 1_usize << qubit_j;
+
+        for k in 0..self.amplitudes.len() {
+            let bit_i = (k & mask_i) >> qubit_i;
+            let bit_j = (k & mask_j) >> qubit_j;
+            let parity = bit_i ^ bit_j;
+
+            if parity == 0 {
+                self.amplitudes[k] *= phase_same;
+            } else {
+                self.amplitudes[k] *= phase_diff;
+            }
+        }
+    }
+}
+```
+
+**Vectorization opportunity**: The inner loop is a streaming operation over the amplitude
+array with no data dependencies between iterations. This is ideal for SIMD vectorization
+(AVX-512 can process 8 complex64 values per instruction) and parallelization across
+cores.
+
+### 2. Mixing Operator
+
+The mixing operator applies Rx(2*beta) to each qubit:
+
+```
+Rx(2*beta) = exp(-i * beta * X) = [[cos(beta), -i*sin(beta)],
+                                     [-i*sin(beta), cos(beta)]]
+```
+
+This uses the standard single-qubit gate application from the simulator core:
+
+```rust
+impl QuantumState {
+    /// Apply the QAOA mixing operator: Rx(2*beta) on each qubit.
+    ///
+    /// Complexity: O(n * 2^n) for n qubits.
+    pub fn qaoa_mixing(&mut self, beta: f64) {
+        for qubit in 0..self.n_qubits {
+            self.rx(2.0 * beta, qubit);
+        }
+    }
+}
+```
+
+### 3. QAOA Circuit Construction
+
+A convenience function builds the full QAOA circuit from a graph and parameters:
+
+```rust
+/// A graph represented as an edge list with optional weights.
+pub struct Graph {
+    /// Number of vertices
+    pub n_vertices: usize,
+    /// Edges: (vertex_i, vertex_j, weight)
+    pub edges: Vec<(usize, usize, f64)>,
+}
+
+impl Graph {
+    /// Construct from adjacency list.
+    pub fn from_adjacency_list(adj: &[Vec<usize>]) -> Self;
+
+    /// Construct from edge list (unweighted, weight = 1.0).
+    pub fn from_edge_list(n_vertices: usize, edges: &[(usize, usize)]) -> Self;
+
+    /// Load from ruVector graph query result.
+    pub fn from_ruvector_query(result: &GraphQueryResult) -> Self;
+}
+
+/// QAOA configuration.
+pub struct QaoaConfig {
+    /// Graph defining the MaxCut instance
+    pub graph: Graph,
+    /// QAOA depth (number of layers)
+    pub p: usize,
+    /// Gamma parameters (phase separation angles), length = p
+    pub gammas: Vec<f64>,
+    /// Beta parameters (mixing angles), length = p
+    pub betas: Vec<f64>,
+}
+
+/// Build and simulate a QAOA circuit for MaxCut.
+///
+/// Circuit structure for depth p:
+///   1. Initialize |+>^n (Hadamard on all qubits)
+///   2. For layer l = 1..p:
+///      a. Phase separation: Rzz(gamma_l, i, j) for each edge (i,j)
+///      b. Mixing: Rx(2*beta_l) on each qubit
+///   3. Return final state
+pub fn build_qaoa_circuit(config: &QaoaConfig) -> QuantumState {
+    let n = config.graph.n_vertices;
+    let mut state = QuantumState::new(n);
+
+    // Step 1: Initialize uniform superposition
+    state.hadamard_all();
+
+    // Step 2: Alternating phase separation and mixing layers
+    for layer in 0..config.p {
+        let gamma = config.gammas[layer];
+        let beta = config.betas[layer];
+
+        // Phase separation: apply Rzz for each edge
+        for &(i, j, weight) in &config.graph.edges {
+            state.rzz(gamma * weight, i, j);
+        }
+
+        // Mixing: Rx(2*beta) on each qubit
+        state.qaoa_mixing(beta);
+    }
+
+    state
+}
+```
+
+**Pseudocode for the complete QAOA MaxCut solver**:
+
+```rust
+pub fn qaoa_maxcut(
+    graph: &Graph,
+    p: usize,
+    optimizer: &mut dyn ClassicalOptimizer,
+    config: &QaoaOptConfig,
+) -> QaoaResult {
+    let n_params = 2 * p; // p gammas + p betas
+    optimizer.initialize(n_params);
+
+    let mut params = config.initial_params.clone()
+        .unwrap_or_else(|| {
+            // Standard initialization: gamma in [0, pi], beta in [0, pi/2]
+            let mut p_init = vec![0.0; n_params];
+            for i in 0..p {
+                p_init[i] = 0.5;          // gamma_i
+                p_init[p + i] = 0.25;     // beta_i
+            }
+            p_init
+        });
+
+    let mut best_cost = f64::NEG_INFINITY;
+    let mut best_params = params.clone();
+    let mut history = Vec::new();
+
+    for iteration in 0..config.max_iterations {
+        let gammas = params[..p].to_vec();
+        let betas = params[p..].to_vec();
+
+        // Build and simulate circuit
+        let qaoa_config = QaoaConfig {
+            graph: graph.clone(),
+            p,
+            gammas,
+            betas,
+        };
+        let state = build_qaoa_circuit(&qaoa_config);
+
+        // Evaluate MaxCut cost function
+        let cost = maxcut_expectation(&state, graph);
+
+        if cost > best_cost {
+            best_cost = cost;
+            best_params = params.clone();
+        }
+
+        // Gradient computation (parameter-shift rule, same as VQE)
+        let grad = if optimizer.needs_gradient() {
+            Some(qaoa_gradient(graph, p, &params))
+        } else {
+            None
+        };
+
+        history.push(QaoaIteration { iteration, cost, params: params.clone() });
+
+        let result = optimizer.step(&params, -cost, grad.as_deref());
+        // Note: negate cost because optimizer minimizes
+        params = result.new_params;
+
+        if result.converged {
+            break;
+        }
+    }
+
+    // Sample the final state to get candidate cuts
+    let final_state = build_qaoa_circuit(&QaoaConfig {
+        graph: graph.clone(),
+        p,
+        gammas: best_params[..p].to_vec(),
+        betas: best_params[p..].to_vec(),
+    });
+    let best_cut = sample_maxcut(&final_state, graph, config.sample_shots);
+
+    QaoaResult {
+        best_cost,
+        best_params,
+        best_cut,
+        iterations: history.len(),
+        history,
+        approximation_ratio: best_cost / graph.max_cut_upper_bound(),
+    }
+}
+```
+
+### 4. Cost Function Evaluation
+
+The MaxCut cost function in Pauli operator form is:
+
+```
+C = sum_{(i,j) in E} w_{ij} * (1 - Z_i Z_j) / 2
+```
+
+This reuses the PauliSum expectation API from ADR-QE-005:
+
+```rust
+/// Compute the MaxCut cost as the expectation value of the cost Hamiltonian.
+///
+/// C = sum_{(i,j) in E} w_ij * (1 - Z_i Z_j) / 2
+///   = sum_{(i,j) in E} w_ij/2 - sum_{(i,j) in E} w_ij/2 * Z_i Z_j
+///   = const - sum_{(i,j)} w_ij/2 * <Z_i Z_j>
+///
+/// Each Z_i Z_j expectation is computed via the efficient diagonal trick:
+/// <psi| Z_i Z_j |psi> = sum_k |amp_k|^2 * (-1)^{bit_i(k) XOR bit_j(k)}
+pub fn maxcut_expectation(state: &QuantumState, graph: &Graph) -> f64 {
+    let mut cost = 0.0;
+
+    for &(i, j, weight) in &graph.edges {
+        let mask_i = 1_usize << i;
+        let mask_j = 1_usize << j;
+
+        let mut zz_expectation = 0.0;
+        for k in 0..state.amplitudes.len() {
+            let bit_i = (k & mask_i) >> i;
+            let bit_j = (k & mask_j) >> j;
+            let parity = bit_i ^ bit_j;
+            let sign = 1.0 - 2.0 * parity as f64; // +1 if same, -1 if different
+            zz_expectation += state.amplitudes[k].norm_sqr() * sign;
+        }
+
+        cost += weight * (1.0 - zz_expectation) / 2.0;
+    }
+
+    cost
+}
+```
+
+**Optimization**: Since Z_i Z_j is diagonal in the computational basis, the expectation
+reduces to a weighted sum over probabilities. No amplitude swapping is needed, and the
+computation is embarrassingly parallel.
+
+### 5. Sampling Mode
+
+In addition to exact expectation values, we support sampling the final state to
+obtain candidate cuts:
+
+```rust
+/// Sample the QAOA state to find candidate MaxCut solutions.
+///
+/// Returns the best cut found across `shots` samples.
+pub fn sample_maxcut(
+    state: &QuantumState,
+    graph: &Graph,
+    shots: usize,
+) -> MaxCutSolution {
+    let probabilities: Vec<f64> = state.amplitudes.iter()
+        .map(|a| a.norm_sqr())
+        .collect();
+
+    let mut best_cut_value = 0.0;
+    let mut best_bitstring = 0_usize;
+    let mut rng = thread_rng();
+
+    for _ in 0..shots {
+        // Sample from probability distribution
+        let sample = sample_from_distribution(&probabilities, &mut rng);
+
+        // Evaluate cut value for this bitstring
+        let cut_value = evaluate_cut(sample, graph);
+
+        if cut_value > best_cut_value {
+            best_cut_value = cut_value;
+            best_bitstring = sample;
+        }
+    }
+
+    MaxCutSolution {
+        partition: best_bitstring,
+        cut_value: best_cut_value,
+        set_s: (0..graph.n_vertices)
+            .filter(|&v| (best_bitstring >> v) & 1 == 1)
+            .collect(),
+        set_s_complement: (0..graph.n_vertices)
+            .filter(|&v| (best_bitstring >> v) & 1 == 0)
+            .collect(),
+    }
+}
+```
+
+### 6. Graph Interface
+
+Three input modes cover common use cases:
+
+```rust
+impl Graph {
+    /// From adjacency list (unweighted).
+    ///
+    /// Example: adj[0] = [1, 3] means vertex 0 connects to 1 and 3.
+    pub fn from_adjacency_list(adj: &[Vec<usize>]) -> Self {
+        let n = adj.len();
+        let mut edges = Vec::new();
+        let mut seen = std::collections::HashSet::new();
+
+        for (u, neighbors) in adj.iter().enumerate() {
+            for &v in neighbors {
+                let edge = if u < v { (u, v) } else { (v, u) };
+                if seen.insert(edge) {
+                    edges.push((edge.0, edge.1, 1.0));
+                }
+            }
+        }
+
+        Self { n_vertices: n, edges }
+    }
+
+    /// From edge list with uniform weight.
+    pub fn from_edge_list(n_vertices: usize, edge_list: &[(usize, usize)]) -> Self {
+        Self {
+            n_vertices,
+            edges: edge_list.iter().map(|&(u, v)| (u, v, 1.0)).collect(),
+        }
+    }
+
+    /// From ruVector graph database query result.
+    ///
+    /// Enables QAOA MaxCut on graphs stored in ruvector-graph.
+    pub fn from_ruvector_query(result: &GraphQueryResult) -> Self {
+        // Convert ruvector-graph nodes and edges to QAOA format
+        // Vertex IDs are remapped to contiguous 0..n range
+        todo!()
+    }
+}
+```
+
+### 7. Tensor Network Optimization for Sparse Graphs
+
+For sparse or planar graphs, the QAOA state can be represented more efficiently using
+tensor network contraction. The key insight is that QAOA circuits have a structure
+dictated by the graph topology:
+
+```
+Tensor Network View of QAOA:
+
+  Qubit 0: ──[H]──[Rzz(0,1)]──[Rzz(0,3)]──[Rx]── ...
+  Qubit 1: ──[H]──[Rzz(0,1)]──[Rzz(1,2)]──[Rx]── ...
+  Qubit 2: ──[H]──[Rzz(1,2)]──[Rzz(2,3)]──[Rx]── ...
+  Qubit 3: ──[H]──[Rzz(0,3)]──[Rzz(2,3)]──[Rx]── ...
+
+For a planar graph with treewidth w, tensor contraction costs O(2^w * poly(n))
+instead of O(2^n). For many practical graphs, w << n.
+```
+
+```rust
+/// Detect graph treewidth and decide simulation strategy.
+pub fn select_simulation_strategy(graph: &Graph) -> SimulationStrategy {
+    let treewidth = estimate_treewidth(graph);
+    let n = graph.n_vertices;
+
+    if treewidth <= 20 && n > 24 {
+        // Tensor network contraction is cheaper than full state vector
+        SimulationStrategy::TensorNetwork {
+            contraction_order: compute_contraction_order(graph),
+            estimated_cost: (1 << treewidth) * n * n,
+        }
+    } else {
+        SimulationStrategy::StateVector {
+            estimated_cost: 1 << n,
+        }
+    }
+}
+
+pub enum SimulationStrategy {
+    StateVector { estimated_cost: usize },
+    TensorNetwork {
+        contraction_order: Vec<ContractionStep>,
+        estimated_cost: usize,
+    },
+}
+```
+
+### 8. Performance Analysis
+
+#### Gate Counts and Timing
+
+For a graph with n vertices, m edges, and QAOA depth p:
+
+| Operation | Gate Count per Layer | Total Gates (p layers) |
+|-----------|---------------------|----------------------|
+| Phase separation (Rzz) | m | p * m |
+| Mixing (Rx) | n | p * n |
+| **Total per layer** | **m + n** | **p * (m + n)** |
+
+**Benchmark estimates**:
+
+| Configuration | n | m | p | Total Gates | Estimated Time |
+|---------------|---|---|---|-------------|---------------|
+| Small triangle | 3 | 3 | 1 | 6 | <0.01ms |
+| Petersen graph | 10 | 15 | 3 | 75 | <0.1ms |
+| Random d-reg (d=3) | 10 | 15 | 5 | 125 | <0.5ms |
+| Grid 4x5 | 20 | 31 | 3 | 189 | ~50ms |
+| Grid 4x5 | 20 | 31 | 5 | 315 | ~100ms |
+| Random d-reg (d=4) | 20 | 40 | 5 | 400 | ~200ms |
+| Dense (complete) | 20 | 190 | 3 | 630 | ~300ms |
+| Sparse large | 24 | 36 | 3 | 216 | ~5s |
+| Dense large | 24 | 276 | 5 | 1500 | ~30s |
+
+**Memory requirements**:
+
+| Qubits | State Vector Size | Memory |
+|--------|------------------|--------|
+| 10 | 1,024 | 16 KB |
+| 16 | 65,536 | 1 MB |
+| 20 | 1,048,576 | 16 MB |
+| 24 | 16,777,216 | 256 MB |
+| 28 | 268,435,456 | 4 GB |
+
+### 9. Integration with ruvector-graph
+
+The connection to ruVector's graph database enables a powerful workflow:
+
+```
+┌─────────────────────────────────────────────────────────────────────┐
+│                  QAOA MaxCut Pipeline                                 │
+│                                                                       │
+│  ┌──────────────┐     ┌────────────────┐     ┌──────────────────┐   │
+│  │ ruvector-graph│     │  QAOA Engine   │     │  Result Store    │   │
+│  │              │     │                │     │                  │   │
+│  │  Query:      │────>│  Build circuit │────>│  Optimal cut     │   │
+│  │  "find all   │     │  Optimize      │     │  Partition       │   │
+│  │   connected  │     │  Sample        │     │  Approximation   │   │
+│  │   subgraphs  │     │                │     │  ratio           │   │
+│  │   of size k" │     │                │     │                  │   │
+│  └──────────────┘     └────────────────┘     └──────────────────┘   │
+│                                                                       │
+│  Data Flow:                                                           │
+│  1. Agent queries ruvector-graph for subgraph                        │
+│  2. Graph converted to QAOA format via Graph::from_ruvector_query()  │
+│  3. QAOA optimizer runs with configurable depth p                     │
+│  4. Results stored in ruVector memory for pattern learning            │
+│  5. Agent uses learned patterns to choose p and initial parameters    │
+└─────────────────────────────────────────────────────────────────────┘
+```
+
+The ruvector-mincut integration is particularly relevant: the existing
+`SubpolynomialMinCut` algorithm (El-Hayek/Henzinger/Li, O(n^{o(1)}) amortized) provides
+exact min-cut values that serve as a lower bound for MaxCut verification. QAOA solutions
+can be validated against this classical baseline.
+
+---
+
+## Consequences
+
+### Benefits
+
+1. **Native Rzz gate** via direct amplitude manipulation avoids CNOT decomposition,
+   yielding a simpler and faster phase separation implementation
+2. **PauliSum expectation API reuse** from ADR-QE-005 provides a unified interface for
+   all variational algorithms (VQE, QAOA, and future extensions)
+3. **Graph interface flexibility** supports adjacency lists, edge lists, and ruVector
+   graph queries, covering the most common input formats
+4. **Tensor network fallback** for low-treewidth graphs extends QAOA to larger problem
+   instances than pure state vector simulation allows
+5. **ruvector-graph integration** enables a seamless pipeline from graph storage to
+   quantum optimization to result analysis
+
+### Risks
+
+| Risk | Probability | Impact | Mitigation |
+|------|------------|--------|------------|
+| QAOA at low depth p gives poor approximation ratios | High | Medium | Support high-p QAOA, classical warm-starting |
+| Treewidth estimation is NP-hard in general | Medium | Low | Use heuristic upper bounds (min-degree, greedy) |
+| Parameter landscape has many local minima | Medium | Medium | Multi-start optimization, INTERP initialization |
+| Large dense graphs exhaust memory | Medium | High | Tensor network fallback, graph coarsening |
+
+### Trade-offs
+
+| Decision | Advantage | Disadvantage |
+|----------|-----------|--------------|
+| Direct Rzz over CNOT decomposition | Simpler, faster | Not a one-to-one hardware circuit mapping |
+| Exact expectation over sampling | No statistical noise | Does not model real hardware shot noise |
+| Automatic strategy selection | Transparent to user | Additional complexity in simulation backend |
+| Integrated graph interface | Seamless workflow | Coupling to ruvector-graph API |
+
+---
+
+## References
+
+- Farhi, E., Goldstone, J., Gutmann, S. "A Quantum Approximate Optimization Algorithm." arXiv:1411.4028 (2014)
+- Hadfield, S. et al. "From the Quantum Approximate Optimization Algorithm to a Quantum Alternating Operator Ansatz." Algorithms 12, 34 (2019)
+- Zhou, L. et al. "Quantum Approximate Optimization Algorithm: Performance, Mechanism, and Implementation on Near-Term Devices." Physical Review X 10, 021067 (2020)
+- Guerreschi, G.G., Matsuura, A.Y. "QAOA for Max-Cut requires hundreds of qubits for quantum speed-up." Scientific Reports 9, 6903 (2019)
+- ADR-001: ruQu Architecture - Classical Nervous System for Quantum Machines
+- ADR-QE-005: VQE Algorithm Support (shared parameterized circuit and optimizer infrastructure)
+- ADR-QE-006: Grover's Search Implementation (quantum state manipulation primitives)
+- ruvector-mincut: `crates/ruvector-mincut/` - El-Hayek/Henzinger/Li subpolynomial min-cut
+- ruvector-graph: graph database integration for sourcing MaxCut instances
diff --git a/docs/adr/quantum-engine/ADR-QE-008-surface-code-error-correction.md b/docs/adr/quantum-engine/ADR-QE-008-surface-code-error-correction.md
new file mode 100644
index 00000000..54cdcfd6
--- /dev/null
+++ b/docs/adr/quantum-engine/ADR-QE-008-surface-code-error-correction.md
@@ -0,0 +1,997 @@
+# ADR-QE-008: Surface Code Error Correction Simulation
+
+**Status**: Proposed
+**Date**: 2026-02-06
+**Authors**: ruv.io, RuVector Team
+**Deciders**: Architecture Review Board
+
+## Version History
+
+| Version | Date | Author | Changes |
+|---------|------|--------|---------|
+| 0.1 | 2026-02-06 | ruv.io | Initial surface code QEC simulation proposal |
+
+---
+
+## Context
+
+### The Importance of QEC Simulation
+
+Quantum Error Correction (QEC) is the bridge between noisy intermediate-scale quantum
+(NISQ) devices and fault-tolerant quantum computing. Before deploying error correction
+on real hardware, every aspect of the QEC stack must be validated through simulation:
+
+1. **Decoder validation**: Verify that decoding algorithms (MWPM, Union-Find, neural
+   decoders) produce correct corrections under various noise models
+2. **Threshold estimation**: Determine the physical error rate below which logical error
+   rate decreases with increasing code distance
+3. **Architecture exploration**: Compare surface code layouts, flag qubit placements, and
+   scheduling strategies
+4. **Noise model development**: Test decoder robustness against realistic noise (correlated
+   errors, leakage, crosstalk)
+
+### Surface Codes as the Leading Architecture
+
+The surface code is the most promising QEC architecture for superconducting qubit
+platforms due to:
+
+| Property | Value |
+|----------|-------|
+| Error threshold | ~1% (highest among practical codes) |
+| Connectivity | Nearest-neighbor only (matches hardware) |
+| Syndrome extraction | Local stabilizer measurements |
+| Decoding | Efficient MWPM, Union-Find in O(n * alpha(n)) |
+
+### Surface Code Layout (Distance-3)
+
+```
+Distance-3 Rotated Surface Code:
+
+Data qubits: D0..D8 (9 total)
+X-stabilizers: X0..X3 (4 ancilla qubits)
+Z-stabilizers: Z0..Z3 (4 ancilla qubits)
+
+    Z0          Z1
+  /    \      /    \
+D0 ──── D1 ──── D2
+|  X0   |  X1   |
+D3 ──── D4 ──── D5
+|  X2   |  X3   |
+D6 ──── D7 ──── D8
+  \    /      \    /
+    Z2          Z3
+
+Qubit count: 9 data + 8 ancilla = 17 total qubits
+State vector: 2^17 = 131,072 complex amplitudes
+Memory: 2 MB per state vector
+```
+
+### What ruQu Provides Today
+
+The existing ruQu crate already implements key components for error correction:
+
+| Component | Module | Status |
+|-----------|--------|--------|
+| Syndrome processing | `syndrome.rs` | Production-ready (1M rounds/sec) |
+| MWPM decoder | `decoder.rs` | Integrated via fusion-blossom |
+| Min-cut coherence | `mincut.rs` | El-Hayek/Henzinger/Li algorithm |
+| Three-filter pipeline | `filters.rs` | Structural + Shift + Evidence |
+| Tile architecture | `tile.rs`, `fabric.rs` | 256-tile WASM fabric |
+| Stim integration | `stim.rs` | Syndrome generation |
+
+What is **missing** is the ability to simulate the full quantum state evolution of a
+surface code cycle: ancilla initialization, stabilizer circuits, projective measurement,
+state collapse, decoder feedback, and correction application. This ADR fills that gap.
+
+### Requirements
+
+| Requirement | Description | Priority |
+|-------------|-------------|----------|
+| Mid-circuit measurement | Projective measurement of individual qubits | P0 |
+| Qubit reset | Reinitialize ancilla qubits to |0> each cycle | P0 |
+| Conditional operations | Apply gates conditioned on measurement outcomes | P0 |
+| Noise injection | Depolarizing, bit-flip, phase-flip channels | P0 |
+| Syndrome extraction | Extract syndrome bits from ancilla measurements | P0 |
+| Decoder integration | Feed syndromes to MWPM/min-cut decoder | P0 |
+| Logical error tracking | Determine if logical error occurred | P1 |
+| Multi-cycle simulation | Run thousands of QEC cycles efficiently | P1 |
+| Leakage modeling | Simulate qubit leakage to non-computational states | P2 |
+
+---
+
+## Decision
+
+### 1. Mid-Circuit Measurement
+
+Mid-circuit measurement is the most critical new capability. Unlike final-state
+measurement (which collapses the entire state), mid-circuit measurement collapses a
+single qubit while preserving the rest of the system for continued evolution.
+
+**Mathematical formulation**:
+
+For measuring qubit q in the computational basis:
+
+1. Split the state into two subspaces:
+   - |psi_0>: amplitudes where qubit q = 0
+   - |psi_1>: amplitudes where qubit q = 1
+2. Compute probabilities:
+   - P(0) = ||psi_0||^2 = sum_{k: bit_q(k)=0} |amp_k|^2
+   - P(1) = ||psi_1||^2 = sum_{k: bit_q(k)=1} |amp_k|^2
+3. Sample outcome m in {0, 1} according to P(0), P(1)
+4. Collapse: zero out amplitudes in the non-selected subspace
+5. Renormalize: divide remaining amplitudes by sqrt(P(m))
+
+```rust
+/// Result of a mid-circuit measurement.
+pub struct MeasurementResult {
+    /// The measured qubit index
+    pub qubit: usize,
+    /// The measurement outcome (0 or 1)
+    pub outcome: u8,
+    /// The probability of this outcome
+    pub probability: f64,
+}
+
+impl QuantumState {
+    /// Perform a projective measurement on a single qubit.
+    ///
+    /// This collapses the qubit to |0> or |1> based on Born probabilities,
+    /// zeroes out amplitudes in the rejected subspace, and renormalizes.
+    ///
+    /// The remaining qubits are left in a valid quantum state for continued
+    /// simulation (essential for mid-circuit measurement in QEC).
+    ///
+    /// Complexity: O(2^n) -- two passes over the state vector.
+    ///   Pass 1: Compute probabilities P(0), P(1)
+    ///   Pass 2: Collapse and renormalize
+    pub fn measure_qubit(
+        &mut self,
+        qubit: usize,
+        rng: &mut impl Rng,
+    ) -> MeasurementResult {
+        let mask = 1_usize << qubit;
+        let n = self.amplitudes.len();
+
+        // Pass 1: Compute P(0) and P(1)
+        let mut prob_0 = 0.0_f64;
+        let mut prob_1 = 0.0_f64;
+
+        for k in 0..n {
+            let p = self.amplitudes[k].norm_sqr();
+            if (k & mask) == 0 {
+                prob_0 += p;
+            } else {
+                prob_1 += p;
+            }
+        }
+
+        // Sample outcome
+        let outcome = if rng.gen::<f64>() < prob_0 { 0_u8 } else { 1_u8 };
+        let prob_selected = if outcome == 0 { prob_0 } else { prob_1 };
+        let norm_factor = 1.0 / prob_selected.sqrt();
+
+        // Pass 2: Collapse and renormalize
+        for k in 0..n {
+            let bit = ((k & mask) >> qubit) as u8;
+            if bit == outcome {
+                self.amplitudes[k] *= norm_factor;
+            } else {
+                self.amplitudes[k] = Complex64::zero();
+            }
+        }
+
+        MeasurementResult {
+            qubit,
+            outcome,
+            probability: prob_selected,
+        }
+    }
+
+    /// Measure multiple qubits (ancilla register).
+    ///
+    /// Measures each qubit sequentially. The order matters because each
+    /// measurement collapses the state before the next measurement.
+    /// For stabilizer measurements, this correctly handles correlated outcomes.
+    pub fn measure_qubits(
+        &mut self,
+        qubits: &[usize],
+        rng: &mut impl Rng,
+    ) -> Vec<MeasurementResult> {
+        qubits.iter()
+            .map(|&q| self.measure_qubit(q, rng))
+            .collect()
+    }
+}
+```
+
+### 2. Qubit Reset
+
+Ancilla qubits must be reinitialized to |0> at the start of each syndrome extraction
+cycle. The reset operation projects onto the |0> subspace and renormalizes:
+
+```rust
+impl QuantumState {
+    /// Reset a qubit to |0>.
+    ///
+    /// Zeroes out all amplitudes where qubit q = 1, then renormalizes.
+    /// This is equivalent to measuring the qubit and, if the outcome is |1>,
+    /// applying an X gate to flip it back to |0>.
+    ///
+    /// Complexity: O(2^n) -- single pass over state vector.
+    ///
+    /// Used for ancilla reinitialization in each QEC cycle.
+    pub fn reset_qubit(&mut self, qubit: usize) {
+        let mask = 1_usize << qubit;
+        let partner_mask = !mask;
+        let n = self.amplitudes.len();
+
+        // For each pair of states (k, k XOR mask), move amplitude from
+        // the |1> component to the |0> component.
+        // This implements: |0><0| + |0><1| (measure-then-flip).
+        //
+        // Simpler approach: zero out |1> subspace, renormalize.
+        let mut norm_sq = 0.0_f64;
+
+        for k in 0..n {
+            if (k & mask) != 0 {
+                // Qubit q is |1> in this basis state
+                // Transfer amplitude to partner state with q = |0>
+                let partner = k & partner_mask;
+                // Coherent reset: add amplitudes
+                // For incoherent reset (thermal): would zero out instead
+                self.amplitudes[partner] += self.amplitudes[k];
+                self.amplitudes[k] = Complex64::zero();
+            }
+        }
+
+        // Renormalize
+        for k in 0..n {
+            norm_sq += self.amplitudes[k].norm_sqr();
+        }
+        let norm_factor = 1.0 / norm_sq.sqrt();
+        for amp in self.amplitudes.iter_mut() {
+            *amp *= norm_factor;
+        }
+    }
+}
+```
+
+### 3. Noise Model
+
+We implement three standard noise channels plus a combined depolarizing model.
+Noise is applied by stochastically inserting Pauli gates after specified operations.
+
+```
+Noise Channels:
+
+Bit-flip (X):     rho -> (1-p) * rho + p * X * rho * X
+Phase-flip (Z):   rho -> (1-p) * rho + p * Z * rho * Z
+Depolarizing:     rho -> (1-p) * rho + p/3 * (X*rho*X + Y*rho*Y + Z*rho*Z)
+```
+
+For state vector simulation, noise is applied via **stochastic Pauli insertion**:
+
+```rust
+/// Noise model configuration.
+#[derive(Debug, Clone)]
+pub struct NoiseModel {
+    /// Single-qubit gate error rate
+    pub single_qubit_error: f64,
+    /// Two-qubit gate error rate
+    pub two_qubit_error: f64,
+    /// Measurement error rate (readout bit-flip)
+    pub measurement_error: f64,
+    /// Idle error rate (per qubit per cycle)
+    pub idle_error: f64,
+    /// Noise type
+    pub noise_type: NoiseType,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub enum NoiseType {
+    /// Random X errors with probability p
+    BitFlip,
+    /// Random Z errors with probability p
+    PhaseFlip,
+    /// Random X, Y, or Z errors each with probability p/3
+    Depolarizing,
+    /// Independent bit-flip (p_x) and phase-flip (p_z)
+    Independent { p_x: f64, p_z: f64 },
+}
+
+impl QuantumState {
+    /// Apply a noise channel to a single qubit.
+    ///
+    /// For depolarizing noise with probability p:
+    ///   - With probability 1-p: do nothing
+    ///   - With probability p/3: apply X
+    ///   - With probability p/3: apply Y
+    ///   - With probability p/3: apply Z
+    ///
+    /// This stochastic Pauli insertion is exact for Pauli channels
+    /// and a good approximation for general noise (Pauli twirl).
+    pub fn apply_noise(
+        &mut self,
+        qubit: usize,
+        error_rate: f64,
+        noise_type: NoiseType,
+        rng: &mut impl Rng,
+    ) {
+        match noise_type {
+            NoiseType::BitFlip => {
+                if rng.gen::<f64>() < error_rate {
+                    self.apply_x(qubit);
+                }
+            }
+            NoiseType::PhaseFlip => {
+                if rng.gen::<f64>() < error_rate {
+                    self.apply_z(qubit);
+                }
+            }
+            NoiseType::Depolarizing => {
+                let r = rng.gen::<f64>();
+                if r < error_rate / 3.0 {
+                    self.apply_x(qubit);
+                } else if r < 2.0 * error_rate / 3.0 {
+                    self.apply_y(qubit);
+                } else if r < error_rate {
+                    self.apply_z(qubit);
+                }
+                // else: no error (identity)
+            }
+            NoiseType::Independent { p_x, p_z } => {
+                if rng.gen::<f64>() < p_x {
+                    self.apply_x(qubit);
+                }
+                if rng.gen::<f64>() < p_z {
+                    self.apply_z(qubit);
+                }
+            }
+        }
+    }
+
+    /// Apply idle noise to all data qubits.
+    ///
+    /// Called once per QEC cycle to model decoherence during idle periods.
+    pub fn apply_idle_noise(
+        &mut self,
+        data_qubits: &[usize],
+        noise: &NoiseModel,
+        rng: &mut impl Rng,
+    ) {
+        for &q in data_qubits {
+            self.apply_noise(q, noise.idle_error, noise.noise_type, rng);
+        }
+    }
+}
+```
+
+### 4. Syndrome Extraction Circuit
+
+A complete surface code syndrome extraction cycle consists of:
+
+1. Reset ancilla qubits to |0>
+2. Apply CNOT chains from data qubits to ancilla (stabilizer circuits)
+3. Measure ancilla qubits to extract syndrome bits
+4. (Optionally) apply noise after each gate
+
+```
+Syndrome Extraction for X-Stabilizer X0 = X_D0 * X_D1 * X_D3 * X_D4:
+
+  D0: ────────●───────────────────────────
+              │
+  D1: ────────┼──────●────────────────────
+              │      │
+  D3: ────────┼──────┼──────●─────────────
+              │      │      │
+  D4: ────────┼──────┼──────┼──────●──────
+              │      │      │      │
+  X0: ──|0>──[H]──CNOT──CNOT──CNOT──CNOT──[H]──[M]── syndrome bit
+
+  (For X-stabilizers: Hadamard on ancilla before and after CNOTs)
+  (For Z-stabilizers: CNOTs in opposite direction, no Hadamards)
+```
+
+```rust
+/// Surface code layout definition.
+pub struct SurfaceCodeLayout {
+    /// Code distance
+    pub distance: usize,
+    /// Data qubit indices
+    pub data_qubits: Vec<usize>,
+    /// X-stabilizer definitions: (ancilla_qubit, [data_qubits])
+    pub x_stabilizers: Vec<(usize, Vec<usize>)>,
+    /// Z-stabilizer definitions: (ancilla_qubit, [data_qubits])
+    pub z_stabilizers: Vec<(usize, Vec<usize>)>,
+    /// Total qubit count (data + ancilla)
+    pub total_qubits: usize,
+}
+
+impl SurfaceCodeLayout {
+    /// Generate a distance-d rotated surface code layout.
+    pub fn rotated(distance: usize) -> Self {
+        let n_data = distance * distance;
+        let n_x_stab = (distance * distance - 1) / 2;
+        let n_z_stab = (distance * distance - 1) / 2;
+        let total = n_data + n_x_stab + n_z_stab;
+
+        // Assign qubit indices:
+        // 0..n_data: data qubits
+        // n_data..n_data+n_x_stab: X-stabilizer ancillae
+        // n_data+n_x_stab..total: Z-stabilizer ancillae
+
+        let data_qubits: Vec<usize> = (0..n_data).collect();
+
+        // Build stabilizer mappings based on rotated surface code geometry
+        let (x_stabilizers, z_stabilizers) =
+            build_rotated_stabilizers(distance, n_data);
+
+        Self {
+            distance,
+            data_qubits,
+            x_stabilizers,
+            z_stabilizers,
+            total_qubits: total,
+        }
+    }
+}
+
+/// One complete syndrome extraction cycle.
+///
+/// Returns the syndrome bitstring (one bit per stabilizer).
+pub fn extract_syndrome(
+    state: &mut QuantumState,
+    layout: &SurfaceCodeLayout,
+    noise: &Option<NoiseModel>,
+    rng: &mut impl Rng,
+) -> SyndromeBits {
+    let mut syndrome = SyndromeBits::new(
+        layout.x_stabilizers.len() + layout.z_stabilizers.len()
+    );
+
+    // Step 1: Reset all ancilla qubits
+    for &(ancilla, _) in layout.x_stabilizers.iter()
+        .chain(layout.z_stabilizers.iter())
+    {
+        state.reset_qubit(ancilla);
+    }
+
+    // Step 2: X-stabilizer circuits
+    for (stab_idx, &(ancilla, ref data)) in layout.x_stabilizers.iter().enumerate() {
+        // Hadamard on ancilla (transforms Z-basis CNOT to X-basis measurement)
+        state.apply_hadamard(ancilla);
+        if let Some(ref n) = noise {
+            state.apply_noise(ancilla, n.single_qubit_error, n.noise_type, rng);
+        }
+
+        // CNOT from each data qubit to ancilla
+        for &d in data {
+            state.apply_cnot(d, ancilla);
+            if let Some(ref n) = noise {
+                state.apply_noise(d, n.two_qubit_error, n.noise_type, rng);
+                state.apply_noise(ancilla, n.two_qubit_error, n.noise_type, rng);
+            }
+        }
+
+        // Hadamard on ancilla
+        state.apply_hadamard(ancilla);
+        if let Some(ref n) = noise {
+            state.apply_noise(ancilla, n.single_qubit_error, n.noise_type, rng);
+        }
+
+        // Measure ancilla
+        let result = state.measure_qubit(ancilla, rng);
+
+        // Apply measurement error
+        let mut outcome = result.outcome;
+        if let Some(ref n) = noise {
+            if rng.gen::<f64>() < n.measurement_error {
+                outcome ^= 1; // Flip the classical bit
+            }
+        }
+
+        syndrome.set(stab_idx, outcome);
+    }
+
+    // Step 3: Z-stabilizer circuits
+    let offset = layout.x_stabilizers.len();
+    for (stab_idx, &(ancilla, ref data)) in layout.z_stabilizers.iter().enumerate() {
+        // No Hadamard for Z-stabilizers
+
+        // CNOT from ancilla to each data qubit
+        for &d in data {
+            state.apply_cnot(ancilla, d);
+            if let Some(ref n) = noise {
+                state.apply_noise(d, n.two_qubit_error, n.noise_type, rng);
+                state.apply_noise(ancilla, n.two_qubit_error, n.noise_type, rng);
+            }
+        }
+
+        // Measure ancilla
+        let result = state.measure_qubit(ancilla, rng);
+
+        let mut outcome = result.outcome;
+        if let Some(ref n) = noise {
+            if rng.gen::<f64>() < n.measurement_error {
+                outcome ^= 1;
+            }
+        }
+
+        syndrome.set(offset + stab_idx, outcome);
+    }
+
+    // Step 4: Apply idle noise to data qubits
+    if let Some(ref n) = noise {
+        state.apply_idle_noise(&layout.data_qubits, n, rng);
+    }
+
+    syndrome
+}
+```
+
+### 5. Decoder Integration
+
+The syndrome bits feed into ruQu's existing decoder infrastructure:
+
+```
+Decoder Pipeline:
+
+  Syndrome Bits ──> SyndromeFilter ──> MWPM Decoder ──> Correction ──> Apply to State
+        │                                    │
+        │                              ┌─────▼─────┐
+        │                              │ ruvector-  │
+        │                              │ mincut     │
+        └──────────────────────────────│ coherence  │
+                                       │ validation │
+                                       └────────────┘
+```
+
+```rust
+/// Decode syndrome and apply corrections.
+///
+/// This function bridges the quantum simulation (state vector) with
+/// ruQu's classical decoder infrastructure.
+pub fn decode_and_correct(
+    state: &mut QuantumState,
+    syndrome: &SyndromeBits,
+    layout: &SurfaceCodeLayout,
+    decoder: &mut MWPMDecoder,
+) -> DecoderResult {
+    // Convert syndrome bits to DetectorBitmap (ruQu format)
+    let mut bitmap = DetectorBitmap::new(syndrome.len());
+    for i in 0..syndrome.len() {
+        bitmap.set(i, syndrome.get(i) == 1);
+    }
+
+    // Decode using MWPM
+    let correction = decoder.decode(&bitmap);
+
+    // Apply X corrections to data qubits
+    for &qubit in &correction.x_corrections {
+        state.apply_x(qubit);
+    }
+
+    // Apply Z corrections to data qubits
+    for &qubit in &correction.z_corrections {
+        state.apply_z(qubit);
+    }
+
+    DecoderResult {
+        correction,
+        syndrome: bitmap,
+        applied: true,
+    }
+}
+```
+
+Integration with `ruvector-mincut` for coherence validation:
+
+```rust
+/// Validate decoder correction using min-cut coherence analysis.
+///
+/// Uses ruQu's existing DynamicMinCutEngine to assess whether the
+/// post-correction state maintains structural coherence.
+pub fn validate_correction(
+    syndrome: &SyndromeBits,
+    correction: &Correction,
+    mincut_engine: &mut DynamicMinCutEngine,
+) -> CoherenceAssessment {
+    // Update min-cut graph edges based on syndrome pattern
+    // High syndrome density in a region lowers edge weights (less coherent)
+    // Correction success restores edge weights
+
+    let cut_value = mincut_engine.query_min_cut();
+
+    CoherenceAssessment {
+        min_cut_value: cut_value.value,
+        is_coherent: cut_value.value > COHERENCE_THRESHOLD,
+        witness: cut_value.witness_hash,
+    }
+}
+```
+
+### 6. Logical Error Tracking
+
+To determine if a logical error has occurred, we compare the initial and final
+logical qubit states:
+
+```rust
+/// Track logical errors across QEC cycles.
+///
+/// A logical error occurs when the cumulative effect of physical errors
+/// and decoder corrections results in a non-trivial logical operator
+/// being applied to the encoded qubit.
+pub struct LogicalErrorTracker {
+    /// Accumulated X corrections on data qubits
+    x_correction_parity: Vec<bool>,
+    /// Accumulated Z corrections on data qubits
+    z_correction_parity: Vec<bool>,
+    /// Known physical X errors (for debugging/validation)
+    x_error_parity: Vec<bool>,
+    /// Known physical Z errors
+    z_error_parity: Vec<bool>,
+    /// Logical X operator support (which data qubits)
+    logical_x_support: Vec<usize>,
+    /// Logical Z operator support
+    logical_z_support: Vec<usize>,
+}
+
+impl LogicalErrorTracker {
+    /// Check if a logical X error has occurred.
+    ///
+    /// A logical X error occurs when the net X-type operator
+    /// (errors + corrections) has odd overlap with the logical Z operator.
+    pub fn has_logical_x_error(&self) -> bool {
+        let mut parity = false;
+        for &q in &self.logical_z_support {
+            parity ^= self.x_error_parity[q] ^ self.x_correction_parity[q];
+        }
+        parity
+    }
+
+    /// Check if a logical Z error has occurred.
+    pub fn has_logical_z_error(&self) -> bool {
+        let mut parity = false;
+        for &q in &self.logical_x_support {
+            parity ^= self.z_error_parity[q] ^ self.z_correction_parity[q];
+        }
+        parity
+    }
+
+    /// Check if any logical error has occurred.
+    pub fn has_logical_error(&self) -> bool {
+        self.has_logical_x_error() || self.has_logical_z_error()
+    }
+}
+```
+
+### 7. Full Surface Code Simulation Cycle
+
+Putting it all together, the complete simulation loop:
+
+```
+Full Surface Code QEC Cycle
+============================
+
+Input:  Code distance d, noise model, number of cycles T, decoder
+
+Output: Logical error rate estimate
+
+    layout = SurfaceCodeLayout::rotated(d)
+    state = QuantumState::new(layout.total_qubits)
+    tracker = LogicalErrorTracker::new(layout)
+    decoder = MWPMDecoder::new(d)
+    mincut = DynamicMinCutEngine::new()
+
+    // Prepare initial logical |0> state
+    prepare_logical_zero(&mut state, &layout)
+
+    for cycle in 0..T:
+        ┌─────────────────────────────────────────────────────┐
+        │  1. INJECT NOISE                                     │
+        │     Apply depolarizing noise to all data qubits      │
+        │     (models decoherence during idle + gate errors)   │
+        │     tracker.record_errors(noise_locations)            │
+        └─────────────────────────────────────────────────────┘
+                               │
+                               ▼
+        ┌─────────────────────────────────────────────────────┐
+        │  2. EXTRACT SYNDROME                                 │
+        │     Reset ancillae -> stabilizer circuits -> measure │
+        │     Returns syndrome bitstring for this cycle        │
+        └─────────────────────────────────────────────────────┘
+                               │
+                               ▼
+        ┌─────────────────────────────────────────────────────┐
+        │  3. DECODE                                           │
+        │     Feed syndrome to MWPM decoder                    │
+        │     Decoder returns correction (X and Z Pauli ops)   │
+        └─────────────────────────────────────────────────────┘
+                               │
+                               ▼
+        ┌─────────────────────────────────────────────────────┐
+        │  4. APPLY CORRECTION                                 │
+        │     Apply Pauli corrections to data qubits           │
+        │     tracker.record_corrections(corrections)          │
+        └─────────────────────────────────────────────────────┘
+                               │
+                               ▼
+        ┌─────────────────────────────────────────────────────┐
+        │  5. VALIDATE COHERENCE (optional)                    │
+        │     Run min-cut analysis on syndrome pattern         │
+        │     Flag if coherence drops below threshold          │
+        └─────────────────────────────────────────────────────┘
+
+    // After T cycles, check for logical error
+    logical_error = tracker.has_logical_error()
+```
+
+**Pseudocode for the full simulation**:
+
+```rust
+/// Run a complete surface code QEC simulation.
+///
+/// Returns the logical error rate estimated from `trials` independent runs,
+/// each consisting of `cycles` QEC rounds.
+pub fn simulate_surface_code(config: &SurfaceCodeConfig) -> SimulationResult {
+    let layout = SurfaceCodeLayout::rotated(config.distance);
+    let mut logical_errors = 0_u64;
+    let mut total_cycles = 0_u64;
+
+    for trial in 0..config.trials {
+        let mut state = QuantumState::new(layout.total_qubits);
+        let mut tracker = LogicalErrorTracker::new(&layout);
+        let mut decoder = MWPMDecoder::new(DecoderConfig {
+            distance: config.distance,
+            physical_error_rate: config.noise.idle_error,
+            ..Default::default()
+        });
+        let mut rng = StdRng::seed_from_u64(config.seed + trial);
+
+        // Prepare logical |0>
+        prepare_logical_zero(&mut state, &layout);
+
+        for cycle in 0..config.cycles {
+            // 1. Inject noise
+            inject_data_noise(&mut state, &layout, &config.noise, &mut rng);
+
+            // 2. Extract syndrome
+            let syndrome = extract_syndrome(
+                &mut state, &layout, &Some(config.noise.clone()), &mut rng
+            );
+
+            // 3. Decode
+            let correction = decoder.decode_syndrome(&syndrome);
+
+            // 4. Apply correction
+            apply_correction(&mut state, &correction);
+            tracker.record_correction(&correction);
+
+            total_cycles += 1;
+        }
+
+        // Check for logical error
+        if tracker.has_logical_error() {
+            logical_errors += 1;
+        }
+    }
+
+    let logical_error_rate = logical_errors as f64 / config.trials as f64;
+    let error_per_cycle = 1.0 - (1.0 - logical_error_rate)
+        .powf(1.0 / config.cycles as f64);
+
+    SimulationResult {
+        logical_error_rate,
+        logical_error_per_cycle: error_per_cycle,
+        total_trials: config.trials,
+        total_cycles,
+        logical_errors,
+        distance: config.distance,
+        physical_error_rate: config.noise.idle_error,
+    }
+}
+```
+
+### 8. Performance Estimates
+
+#### Distance-3 Surface Code
+
+| Parameter | Value |
+|-----------|-------|
+| Data qubits | 9 |
+| Ancilla qubits | 8 |
+| Total qubits | 17 |
+| State vector entries | 2^17 = 131,072 |
+| State vector memory | 2 MB |
+| CNOTs per cycle | ~16 (4 per stabilizer, 4 stabilizers active) |
+| Measurements per cycle | 8 |
+| Resets per cycle | 8 |
+| **Time per cycle** | **~0.5ms** |
+| **1000 cycles** | **~0.5s** |
+
+#### Distance-5 Surface Code
+
+| Parameter | Value |
+|-----------|-------|
+| Data qubits | 25 |
+| Ancilla qubits | 24 |
+| Total qubits | 49 |
+| State vector entries | 2^49 ~ 5.6 * 10^14 |
+| State vector memory | **4 PB** (infeasible for full state vector) |
+
+This highlights the fundamental scaling challenge: full state vector simulation of
+distance-5 surface codes requires stabilizer simulation or tensor network methods,
+not direct state vector evolution. However, for the critical distance-3 case, state
+vector simulation is fast and provides ground truth.
+
+**Practical simulation envelope**:
+
+| Distance | Qubits | State Vector | Feasible? | Cycles/sec |
+|----------|--------|-------------|-----------|------------|
+| 2 (toy) | 7 | 128 entries | Yes | ~50,000 |
+| 3 | 17 | 131K entries | Yes | ~2,000 |
+| 3 (with noise) | 17 | 131K entries | Yes | ~1,000 |
+| 4 | 31 | 2B entries | Marginal (16 GB) | ~0.1 |
+| 5+ | 49+ | >10^14 | No (state vector) | -- |
+
+For distance 5 and above, the implementation should fall back to **stabilizer
+simulation** (Gottesman-Knill theorem: Clifford circuits on stabilizer states can be
+simulated in polynomial time). Since surface code circuits consist entirely of Clifford
+gates (H, CNOT, S) with Pauli noise, this is a natural fit.
+
+### 9. Integration with Existing ruQu Pipeline
+
+The surface code simulation integrates with the full ruQu stack:
+
+```
+┌─────────────────────────────────────────────────────────────────────┐
+│                    ruQu QEC Simulation Stack                         │
+│                                                                       │
+│  ┌─────────────┐  ┌──────────────┐  ┌───────────────────────────┐   │
+│  │  State       │  │  Syndrome     │  │  Decoder Pipeline          │   │
+│  │  Vector      │  │  Processing   │  │                           │   │
+│  │  Engine      │──│  (syndrome.rs)│──│  SyndromeFilter           │   │
+│  │  (new)       │  │              │  │  ├── StructuralFilter      │   │
+│  │              │  │  DetectorBitmap  │  │  ├── ShiftFilter         │   │
+│  │  measure()   │  │  SyndromeBuffer │  │  ├── EvidenceFilter      │   │
+│  │  reset()     │  │  SyndromeDelta │  │  └── MWPM Decoder        │   │
+│  │  noise()     │  │              │  │      (decoder.rs)          │   │
+│  └─────────────┘  └──────────────┘  └───────────────────────────┘   │
+│         │                                        │                    │
+│         │              ┌─────────────────────────┘                    │
+│         │              │                                              │
+│         ▼              ▼                                              │
+│  ┌──────────────────────────┐  ┌────────────────────────────────┐   │
+│  │  Correction Application  │  │  Coherence Validation           │   │
+│  │                          │  │                                  │   │
+│  │  apply_x(qubit)         │  │  DynamicMinCutEngine             │   │
+│  │  apply_z(qubit)         │  │  (mincut.rs)                     │   │
+│  │                          │  │                                  │   │
+│  │  Logical Error Tracker   │  │  El-Hayek/Henzinger/Li          │   │
+│  └──────────────────────────┘  │  O(n^{o(1)}) min-cut            │   │
+│                                  └────────────────────────────────┘   │
+│                                                                       │
+│  ┌───────────────────────────────────────────────────────────────┐   │
+│  │  Tile Architecture (fabric.rs, tile.rs)                        │   │
+│  │                                                                 │   │
+│  │  TileZero (coordinator) + 255 WorkerTiles                      │   │
+│  │  Can parallelize across stabilizer groups for large codes      │   │
+│  └───────────────────────────────────────────────────────────────┘   │
+└─────────────────────────────────────────────────────────────────────┘
+```
+
+Key integration points:
+
+1. **Syndrome bits** from `measure_qubit()` are converted to `DetectorBitmap` format
+   for compatibility with ruQu's existing syndrome processing pipeline
+2. **MWPM decoder** from `decoder.rs` (backed by fusion-blossom) receives syndromes
+   and returns corrections
+3. **Min-cut coherence** from `mincut.rs` validates post-correction state quality
+4. **Tile architecture** from `fabric.rs` can distribute stabilizer measurements across
+   tiles for parallel processing of large codes
+5. **Stim integration** from `stim.rs` provides reference syndrome distributions for
+   decoder benchmarking
+
+### 10. Error Rate Estimation
+
+To estimate the error threshold, we run simulations at multiple physical error rates
+and code distances:
+
+```rust
+/// Estimate the error threshold by scanning physical error rates.
+///
+/// The threshold is the physical error rate p* at which logical error rate
+/// is independent of code distance. Below p*, larger codes are better.
+/// Above p*, larger codes are worse.
+pub fn estimate_threshold(
+    distances: &[usize],
+    error_rates: &[f64],
+    cycles_per_trial: usize,
+    trials: usize,
+) -> ThresholdResult {
+    let mut results = Vec::new();
+
+    for &d in distances {
+        for &p in error_rates {
+            let config = SurfaceCodeConfig {
+                distance: d,
+                noise: NoiseModel {
+                    idle_error: p,
+                    single_qubit_error: p / 10.0,
+                    two_qubit_error: p,
+                    measurement_error: p,
+                    noise_type: NoiseType::Depolarizing,
+                },
+                cycles: cycles_per_trial,
+                trials: trials as u64,
+                seed: 42,
+            };
+
+            let sim_result = simulate_surface_code(&config);
+            results.push((d, p, sim_result.logical_error_per_cycle));
+        }
+    }
+
+    // Find crossing point of d=3 and d=5 curves
+    find_threshold_crossing(&results)
+}
+```
+
+---
+
+## Consequences
+
+### Benefits
+
+1. **Full quantum state simulation** provides ground truth for decoder validation that
+   stabilizer simulation alone cannot (e.g., non-Clifford noise, leakage states)
+2. **Seamless integration** with ruQu's existing syndrome processing, MWPM decoder,
+   and min-cut coherence infrastructure minimizes new code and leverages battle-tested
+   components
+3. **Mid-circuit measurement** and qubit reset enable accurate simulation of the actual
+   hardware QEC cycle, not just the error model
+4. **Noise model flexibility** (bit-flip, phase-flip, depolarizing, independent) covers
+   the standard noise models used in QEC research
+5. **Logical error tracking** provides direct measurement of the quantity of interest
+   (logical error rate) without post-hoc analysis
+6. **Integration with min-cut coherence** validates that decoder corrections maintain
+   structural coherence, bridging ruQu's unique coherence-gating approach with standard
+   QEC metrics
+
+### Risks
+
+| Risk | Probability | Impact | Mitigation |
+|------|------------|--------|------------|
+| State vector memory limits simulation to d <= 3 | High | High | Stabilizer simulation fallback for d >= 5 |
+| Mid-circuit measurement breaks SIMD optimization | Medium | Medium | Separate hot/cold paths, measurement is infrequent |
+| Noise model too simplistic for real hardware | Medium | Medium | Support custom noise channels, correlated errors |
+| Decoder latency dominates simulation time | Low | Medium | Use streaming decoder, pre-built matching graphs |
+| Logical error tracking complexity for higher distance | Low | Low | Automate logical operator computation from layout |
+
+### Trade-offs
+
+| Decision | Advantage | Disadvantage |
+|----------|-----------|--------------|
+| State vector over stabilizer simulation | Handles arbitrary noise and non-Clifford ops | Exponential memory, limited to d <= 3-4 |
+| Stochastic Pauli insertion for noise | Simple, exact for Pauli channels | Approximate for non-Pauli noise |
+| Sequential ancilla measurement | Correct correlated outcomes | Cannot parallelize measurement step |
+| Integration with existing ruQu decoder | Reuses battle-tested code | Decoder API may not perfectly match simulation needs |
+| Coherent reset (amplitude transfer) | Preserves entanglement structure | More complex than incoherent reset |
+
+---
+
+## References
+
+- Fowler, A.G. et al. "Surface codes: Towards practical large-scale quantum computation." Physical Review A 86, 032324 (2012)
+- Dennis, E. et al. "Topological quantum memory." Journal of Mathematical Physics 43, 4452-4505 (2002)
+- Google Quantum AI. "Suppressing quantum errors by scaling a surface code logical qubit." Nature 614, 676-681 (2023)
+- Higgott, O. "PyMatching: A Python package for decoding quantum codes with minimum-weight perfect matching." ACM Transactions on Quantum Computing 3, 1-16 (2022)
+- Wu, Y. & Lin, H.H. "Hypergraph Decomposition and Secret Sharing." Discrete Applied Mathematics (2024)
+- ADR-001: ruQu Architecture - Classical Nervous System for Quantum Machines
+- ADR-QE-005: VQE Algorithm Support (quantum state manipulation, expectation values)
+- ADR-QE-006: Grover's Search (state vector operations, measurement)
+- ruQu syndrome module: `crates/ruQu/src/syndrome.rs` - DetectorBitmap, SyndromeBuffer
+- ruQu decoder module: `crates/ruQu/src/decoder.rs` - MWPMDecoder, fusion-blossom
+- ruQu mincut module: `crates/ruQu/src/mincut.rs` - DynamicMinCutEngine
+- ruQu filters module: `crates/ruQu/src/filters.rs` - Three-filter coherence pipeline
+- ruvector-mincut crate: `crates/ruvector-mincut/` - El-Hayek/Henzinger/Li algorithm
diff --git a/docs/adr/quantum-engine/ADR-QE-009-tensor-network-evaluation.md b/docs/adr/quantum-engine/ADR-QE-009-tensor-network-evaluation.md
new file mode 100644
index 00000000..360525ce
--- /dev/null
+++ b/docs/adr/quantum-engine/ADR-QE-009-tensor-network-evaluation.md
@@ -0,0 +1,480 @@
+# ADR-QE-009: Tensor Network Evaluation Mode
+
+**Status**: Proposed
+**Date**: 2026-02-06
+**Authors**: ruv.io, RuVector Team
+**Deciders**: Architecture Review Board
+
+---
+
+## Context
+
+Full state-vector simulation stores all 2^n complex amplitudes explicitly, yielding
+O(2^n) memory and O(G * 2^n) time for G gates. At n=30 this is 16 GiB; at n=40 it
+exceeds 16 TiB. Many practically interesting circuits, however, contain limited
+entanglement:
+
+| Circuit family | Entanglement structure | Treewidth |
+|---|---|---|
+| Shallow QAOA on sparse graphs | Bounded by graph degree | Low (often < 20) |
+| Separate-register circuits | Disjoint qubit subsets | Sum of sub-widths |
+| Near-Clifford circuits | Stabilizer + few T gates | Depends on T count |
+| 1D brickwork (finite depth) | Area-law entanglement | O(depth) |
+| Random deep circuits (all-to-all) | Volume-law entanglement | O(n) -- no gain |
+
+For the first four families, tensor network (TN) methods can trade increased
+computation for drastically reduced memory by representing each gate as a tensor and
+contracting the resulting network in an optimized order. The contraction cost scales
+exponentially in the *treewidth* of the circuit's line graph rather than in the total
+qubit count.
+
+QuantRS2 (the Rust quantum simulation reference) demonstrated tensor network
+contraction for circuits up to 60 qubits on commodity hardware when treewidth
+remained below ~25. ruVector's existing `ruvector-mincut` crate already solves graph
+partitioning problems that are structurally identical to contraction-order
+optimization, providing a natural integration point.
+
+The ruQu engine needs this capability to support:
+
+1. Surface code simulations at distance d >= 7 (49+ data qubits) for decoder
+   validation, where the syndrome extraction circuit is shallow and geometrically
+   local.
+2. Variational algorithm prototyping (VQE, QAOA) on graphs larger than 30 nodes.
+3. Hybrid workflows where part of the circuit is simulated via state vector and part
+   via tensor contraction.
+
+## Decision
+
+### 1. Feature-Gated Backend
+
+Tensor network evaluation is implemented as an optional backend behind the
+`tensor-network` feature flag in `ruqu-core`:
+
+```toml
+# ruqu-core/Cargo.toml
+[features]
+default = ["state-vector"]
+state-vector = []
+tensor-network = ["dep:ndarray", "dep:petgraph"]
+all-backends = ["state-vector", "tensor-network"]
+```
+
+When both backends are compiled in, the engine selects the backend at runtime based
+on circuit analysis (see Section 4 below).
+
+### 2. Tensor Representation
+
+Every gate becomes a tensor connecting the qubit wire indices it acts on:
+
+| Gate type | Tensor rank | Shape | Example |
+|---|---|---|---|
+| Single-qubit (H, X, Rz, ...) | 2 | [2, 2] | Input wire -> output wire |
+| Two-qubit (CNOT, CZ, ...) | 4 | [2, 2, 2, 2] | Two input wires -> two output wires |
+| Three-qubit (Toffoli) | 6 | [2, 2, 2, 2, 2, 2] | Three input -> three output |
+| Measurement projector | 2 | [2, 2] | Diagonal in computational basis |
+| Initial state |0> | 1 | [2] | Single output wire |
+
+The circuit is converted into a tensor network graph where:
+- Each tensor is a node.
+- Each shared index (qubit wire between consecutive gates) is an edge.
+- Open indices represent initial states and final measurement outcomes.
+
+```
+  |0>---[H]---[CNOT_ctrl]---[Rz]---<meas>
+                  |
+  |0>-----------[CNOT_tgt]---------<meas>
+```
+
+Becomes:
+
+```
+  Node: init_0 (rank 1)
+    |
+  Node: H_0 (rank 2)
+    |
+  Node: CNOT_01 (rank 4)
+   / \
+  |   Node: Rz_0 (rank 2)
+  |     |
+  |   Node: meas_0 (rank 2)
+  |
+  Node: init_1 (rank 1)
+    ... (connected via CNOT shared index)
+  Node: meas_1 (rank 2)
+```
+
+### 3. Contraction Strategy
+
+Contraction order determines whether the computation is tractable. The cost of
+contracting two tensors is the product of the dimensions of all indices involved.
+Finding the optimal contraction order is NP-hard (equivalent to finding minimum
+treewidth), so we use heuristics.
+
+#### Contraction Path Optimization Pseudocode
+
+```
+function find_contraction_path(tensor_network: TN) -> ContractionPath:
+    // Phase 1: Simplify the network
+    apply_trivial_contractions(tensor_network)  // rank-1 tensors, diagonal pairs
+
+    // Phase 2: Detect community structure
+    communities = detect_communities(tensor_network.graph)
+
+    // Phase 3: Contract within communities first (small subproblems)
+    intra_paths = []
+    for community in communities:
+        subgraph = tensor_network.subgraph(community)
+        if subgraph.num_tensors <= 20:
+            // Exact dynamic programming for small subgraphs
+            path = optimal_einsum_dp(subgraph)
+        else:
+            // Greedy with lookahead for larger subgraphs
+            path = greedy_with_lookahead(subgraph, lookahead=2)
+        intra_paths.append(path)
+
+    // Phase 4: Contract inter-community edges
+    // Each community is now a single large tensor
+    meta_graph = contract_communities(tensor_network, intra_paths)
+    inter_path = greedy_with_lookahead(meta_graph, lookahead=3)
+
+    // Phase 5: Compose the full path
+    return compose_paths(intra_paths, inter_path)
+
+
+function greedy_with_lookahead(tn: TN, lookahead: int) -> Path:
+    path = []
+    remaining = tn.clone()
+
+    while remaining.num_tensors > 1:
+        best_cost = INFINITY
+        best_pair = None
+
+        // Evaluate all candidate contractions
+        for (i, j) in remaining.candidate_pairs():
+            cost = contraction_cost(remaining, i, j)
+
+            // Lookahead: estimate cost of subsequent contractions
+            if lookahead > 0:
+                simulated = remaining.simulate_contraction(i, j)
+                future_cost = estimate_future_cost(simulated, lookahead - 1)
+                cost += future_cost * DISCOUNT_FACTOR
+
+            if cost < best_cost:
+                best_cost = cost
+                best_pair = (i, j)
+
+        path.append(best_pair)
+        remaining.contract(best_pair)
+
+    return path
+```
+
+#### Community Detection via ruvector-mincut
+
+The `ruvector-mincut` crate provides graph partitioning that is directly applicable
+to contraction ordering:
+
+```rust
+use ruvector_mincut::{partition, PartitionConfig};
+
+fn partition_tensor_network(tn: &TensorNetwork) -> Vec<Vec<TensorId>> {
+    let graph = tn.to_adjacency_graph();
+    let config = PartitionConfig {
+        num_partitions: estimate_optimal_partitions(tn),
+        balance_factor: 1.1,  // Allow 10% imbalance
+        minimize: Objective::EdgeCut,  // Minimize inter-partition wires
+    };
+    partition(&graph, &config)
+}
+```
+
+The edge cut directly corresponds to the bond dimension of the inter-community
+contraction, so minimizing edge cut minimizes the most expensive contraction step.
+
+### 4. MPS (Matrix Product State) Mode
+
+For circuits with 1D-like connectivity (nearest-neighbor gates on a line), a Matrix
+Product State representation is more efficient than general tensor contraction.
+
+```
+    A[1] -- A[2] -- A[3] -- ... -- A[n]
+     |       |       |               |
+   phys_1  phys_2  phys_3         phys_n
+```
+
+Each site tensor A[i] has shape `[bond_left, physical, bond_right]` where:
+- `physical` = 2 (qubit dimension)
+- `bond_left`, `bond_right` = bond dimension chi
+
+| Bond dimension (chi) | Memory per site | Total memory (n qubits) | Approximation |
+|---|---|---|---|
+| 1 | 16 bytes | 16n bytes | Product state only |
+| 16 | 4 KiB | 4n KiB | Low entanglement |
+| 64 | 64 KiB | 64n KiB | Moderate entanglement |
+| 256 | 1 MiB | n MiB | High entanglement |
+| 1024 | 16 MiB | 16n MiB | Near exact for many circuits |
+
+**Truncation policy**: After each two-qubit gate, perform SVD on the updated bond.
+If the bond dimension exceeds `chi_max`, truncate the smallest singular values.
+Track the total discarded weight (sum of squared discarded singular values) as a
+fidelity estimate:
+
+```rust
+pub struct MpsConfig {
+    /// Maximum bond dimension. Truncation occurs above this.
+    pub chi_max: usize,
+    /// Minimum singular value to retain (relative to largest).
+    pub svd_cutoff: f64,
+    /// Accumulated truncation error (updated during simulation).
+    pub fidelity_estimate: f64,
+}
+
+impl Default for MpsConfig {
+    fn default() -> Self {
+        Self {
+            chi_max: 256,
+            svd_cutoff: 1e-12,
+            fidelity_estimate: 1.0,
+        }
+    }
+}
+```
+
+### 5. Automatic Mode Selection
+
+The engine analyzes the circuit before execution to recommend a backend:
+
+```rust
+pub enum RecommendedBackend {
+    StateVector { reason: &'static str },
+    TensorNetwork { estimated_treewidth: usize, reason: &'static str },
+    Mps { estimated_max_bond: usize, reason: &'static str },
+}
+
+pub fn recommend_backend(circuit: &QuantumCircuit) -> RecommendedBackend {
+    let n = circuit.num_qubits();
+    let depth = circuit.depth();
+    let connectivity = circuit.connectivity_graph();
+
+    // Rule 1: Small circuits always use state vector
+    if n <= 20 {
+        return RecommendedBackend::StateVector {
+            reason: "Small circuit; state vector is fastest below 20 qubits",
+        };
+    }
+
+    // Rule 2: Check for 1D connectivity (MPS candidate)
+    if connectivity.max_degree() <= 2 && connectivity.is_path_graph() {
+        let estimated_bond = 2_usize.pow(depth.min(20) as u32);
+        return RecommendedBackend::Mps {
+            estimated_max_bond: estimated_bond,
+            reason: "1D nearest-neighbor connectivity detected",
+        };
+    }
+
+    // Rule 3: Estimate treewidth for general TN
+    let estimated_tw = estimate_treewidth(&connectivity, depth);
+    if estimated_tw < 25 && n > 25 {
+        return RecommendedBackend::TensorNetwork {
+            estimated_treewidth: estimated_tw,
+            reason: "Low treewidth relative to qubit count",
+        };
+    }
+
+    // Rule 4: Check memory feasibility for state vector
+    let sv_memory = 16 * (1_usize << n);  // bytes
+    let available = estimate_available_memory();
+    if sv_memory > available {
+        // Force TN even if treewidth is high -- at least it has a chance
+        return RecommendedBackend::TensorNetwork {
+            estimated_treewidth: estimated_tw,
+            reason: "State vector exceeds available memory; TN is only option",
+        };
+    }
+
+    RecommendedBackend::StateVector {
+        reason: "High treewidth circuit; state vector is more efficient",
+    }
+}
+```
+
+### 6. When Tensor Networks Win vs Lose
+
+**Tensor networks win when:**
+
+| Scenario | Why TN wins | Example |
+|---|---|---|
+| Shallow circuits on many qubits | Treewidth ~ depth, not n | 50-qubit depth-4 QAOA |
+| Sparse graph connectivity | Low treewidth from graph structure | MaxCut on 3-regular graph |
+| Separate registers | Independent contractions | n/2 Bell pairs |
+| Near-Clifford | Stabilizer + few non-Clifford gates | Clifford + 5 T gates |
+| Amplitude computation | Contract to single output, not full state | Sampling one bitstring |
+
+**Tensor networks lose when:**
+
+| Scenario | Why TN loses | Fallback |
+|---|---|---|
+| Deep random circuits | Treewidth ~ n | State vector (if n <= 30) |
+| All-to-all connectivity | No structure to exploit | State vector |
+| Full state tomography needed | Must contract once per amplitude | State vector |
+| Very small circuits (n < 20) | Overhead exceeds state vector | State vector |
+| High-fidelity MPS needed | Bond dimension grows exponentially | State vector or exact TN |
+
+### 7. Example: 50-Qubit Shallow QAOA
+
+Consider QAOA depth p=1 on a 50-node 3-regular graph:
+
+```
+Circuit structure:
+  - 50 qubits, initialized to |+>
+  - 75 ZZ gates (one per edge), parameterized by gamma
+  - 50 Rx gates, parameterized by beta
+  - Total: 125 + 50 = 175 gates
+  - Circuit depth: 4 (H layer, ZZ layer (3-colorable), Rx layer, measure)
+
+Graph treewidth of 3-regular graph: typically 8-15
+
+Tensor network contraction:
+  - Community detection finds ~5-8 communities of 6-10 nodes
+  - Intra-community contraction: O(2^10) ~ 1024 per community
+  - Inter-community bonds: ~15 edges cut
+  - Effective contraction complexity: O(2^15) = 32768
+  - Compare to state vector: O(2^50) = 1.1 * 10^15
+
+Memory comparison:
+  - State vector: 2^50 * 16 bytes = 16 PiB (impossible)
+  - Tensor network: ~100 MiB working memory
+  - Speedup factor: practically infinite (feasible vs infeasible)
+```
+
+```
+Contraction Diagram (simplified):
+
+  Community A        Community B        Community C
+  [q0-q9]           [q10-q19]          [q20-q29]
+     |                  |                   |
+     +--- bond=2^3 ----+---- bond=2^4 -----+
+                        |
+  Community D        Community E
+  [q30-q39]          [q40-q49]
+     |                  |
+     +--- bond=2^3 ----+
+
+  Peak intermediate tensor: 2^15 elements = 512 KiB
+```
+
+### 8. Integration with State Vector Backend
+
+Both backends implement the same trait:
+
+```rust
+pub trait SimulationBackend {
+    /// Execute the circuit and return measurement results.
+    fn execute(
+        &self,
+        circuit: &QuantumCircuit,
+        shots: usize,
+        config: &SimulationConfig,
+    ) -> Result<SimulationResult, SimulationError>;
+
+    /// Compute expectation value of an observable.
+    fn expectation_value(
+        &self,
+        circuit: &QuantumCircuit,
+        observable: &Observable,
+        config: &SimulationConfig,
+    ) -> Result<f64, SimulationError>;
+
+    /// Return the backend name for logging.
+    fn name(&self) -> &'static str;
+}
+```
+
+Users interact through `QuantumCircuit` and never need to know which backend is
+active:
+
+```rust
+let circuit = QuantumCircuit::new(50)
+    .h_all()
+    .append_qaoa_layer(graph, gamma, beta)
+    .measure_all();
+
+// Automatic backend selection
+let result = ruqu::execute(&circuit, 1000)?;
+// -> Internally selects TensorNetwork backend due to n=50, low treewidth
+
+// Or explicit backend override
+let result = ruqu::execute_with_backend(
+    &circuit,
+    1000,
+    Backend::TensorNetwork(TnConfig::default()),
+)?;
+```
+
+### 9. Future: ruvector-mincut Integration for Contraction Ordering
+
+The `ruvector-mincut` crate currently solves balanced graph partitioning for vector
+index sharding. The same algorithm directly applies to tensor network contraction
+ordering via the following correspondence:
+
+| Graph partitioning concept | TN contraction concept |
+|---|---|
+| Vertex | Tensor |
+| Edge weight | Bond dimension (log2) |
+| Partition | Contraction subtree |
+| Edge cut | Inter-partition bond cost |
+| Balanced partition | Balanced contraction tree |
+
+Phase 1 (this ADR): Use `ruvector-mincut` for community detection in contraction
+path optimization.
+
+Phase 2 (future): Extend `ruvector-mincut` with hypergraph partitioning for
+multi-index tensor contractions, enabling handling of higher-order tensor networks
+(e.g., PEPS for 2D circuits).
+
+## Consequences
+
+### Positive
+
+1. **Dramatically expanded qubit range**: Shallow circuits on 40-60 qubits become
+   tractable on commodity hardware.
+2. **Surface code simulation**: Distance-7 surface codes (49 data + 48 ancilla = 97
+   qubits) can be simulated for decoder validation using MPS (the circuit is
+   geometrically local).
+3. **Unified interface**: Users write circuits once; backend selection is automatic.
+4. **Synergy with ruvector-mincut**: Leverages existing graph partitioning
+   investment.
+5. **Complementary to state vector**: Each backend covers the other's weakness.
+
+### Negative
+
+1. **Implementation complexity**: Tensor contraction, SVD truncation, and path
+   optimization are non-trivial to implement correctly and efficiently.
+2. **Approximation risk**: MPS truncation introduces controlled but nonzero error.
+   Users must understand fidelity estimates.
+3. **Compilation time**: The `ndarray` and `petgraph` dependencies add to compile
+   time when the feature is enabled.
+4. **Testing surface**: Two backends doubles the testing matrix for correctness
+   validation.
+5. **Performance unpredictability**: Contraction cost depends on circuit structure
+   in ways that are hard to predict without running the path optimizer.
+
+### Risks and Mitigations
+
+| Risk | Likelihood | Impact | Mitigation |
+|---|---|---|---|
+| Path optimizer finds poor ordering | Medium | High cost | Multiple heuristics + timeout fallback to greedy |
+| MPS fidelity silently degrades | Medium | Incorrect results | Track discarded weight; warn if fidelity < 0.99 |
+| Feature interaction bugs | Low | Incorrect results | Shared test suite: both backends must agree on small circuits |
+| Memory spike during contraction | Medium | OOM | Pre-estimate peak intermediate tensor size; abort if too large |
+
+## References
+
+- QuantRS2 tensor network implementation: internal reference
+- Markov & Shi, "Simulating Quantum Computation by Contracting Tensor Networks" (2008)
+- Gray & Kourtis, "Hyper-optimized tensor network contraction" (2021) -- cotengra
+- Schollwock, "The density-matrix renormalization group in the age of matrix product states" (2011)
+- ADR-QE-001: Core Engine Architecture (state vector backend)
+- ADR-QE-005: WASM Compilation Target
+- `ruvector-mincut` crate documentation
+- ADR-014: Coherence Engine (graph partitioning reuse)
diff --git a/docs/adr/quantum-engine/ADR-QE-010-observability-monitoring.md b/docs/adr/quantum-engine/ADR-QE-010-observability-monitoring.md
new file mode 100644
index 00000000..95781725
--- /dev/null
+++ b/docs/adr/quantum-engine/ADR-QE-010-observability-monitoring.md
@@ -0,0 +1,689 @@
+# ADR-QE-010: Observability & Monitoring Integration
+
+**Status**: Proposed
+**Date**: 2026-02-06
+**Authors**: ruv.io, RuVector Team
+**Deciders**: Architecture Review Board
+
+---
+
+## Context
+
+ruVector provides comprehensive observability through the `ruvector-metrics` crate,
+which aggregates telemetry from all subsystems into a unified monitoring dashboard.
+The quantum simulation engine is a new subsystem that must participate in this
+observability infrastructure.
+
+Effective monitoring of quantum simulation is essential for:
+
+1. **Performance tuning**: Identifying bottlenecks in gate application, memory
+   allocation, and parallelization efficiency.
+2. **Resource management**: Tracking memory consumption to prevent OOM conditions
+   and to inform auto-scaling decisions.
+3. **Debugging**: Tracing the execution of specific circuits to diagnose incorrect
+   results or unexpected behavior.
+4. **Capacity planning**: Understanding workload patterns (qubit counts, circuit
+   depths, simulation frequency) to plan infrastructure.
+5. **Compliance**: Auditable logs of simulation executions for regulated
+   environments (cryptographic validation, safety-critical applications).
+
+### WASM Constraint
+
+In WebAssembly deployment, there is no direct filesystem access and no native
+networking. Observability in WASM must use browser-compatible mechanisms:
+`console.log`, `console.warn`, `console.error`, or JavaScript callback functions
+registered by the host application.
+
+### Existing Infrastructure
+
+| Component | Role | Integration Point |
+|---|---|---|
+| `ruvector-metrics` | Metrics aggregation and export | Trait-based sink |
+| `ruvector-monitor` | Real-time dashboard UI | WebSocket feed |
+| Rust `tracing` crate | Structured logging and spans | Subscriber-based |
+| Prometheus / OpenTelemetry | External monitoring | Exporter plugins |
+| Ed25519 audit trail | Cryptographic logging | `ruqu-audit` crate |
+
+## Decision
+
+### 1. Metrics Schema
+
+Every simulation execution emits a structured metrics record. The schema is
+versioned to allow evolution without breaking consumers.
+
+```rust
+/// Metrics emitted after each quantum simulation execution.
+/// Schema version: 1.0.0
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SimulationMetrics {
+    /// Schema version for forward compatibility.
+    pub schema_version: &'static str,
+
+    /// Unique identifier for this simulation run.
+    pub simulation_id: Uuid,
+
+    /// Timestamp when simulation started (UTC).
+    pub started_at: DateTime<Utc>,
+
+    /// Timestamp when simulation completed (UTC).
+    pub completed_at: DateTime<Utc>,
+
+    // -- Circuit characteristics --
+
+    /// Number of qubits in the circuit.
+    pub qubit_count: u32,
+
+    /// Total number of gates (before optimization).
+    pub gate_count_raw: u64,
+
+    /// Total number of gates (after optimization/fusion).
+    pub gate_count_optimized: u64,
+
+    /// Circuit depth (longest path from input to output).
+    pub circuit_depth: u32,
+
+    /// Number of two-qubit gates (entangling operations).
+    pub two_qubit_gate_count: u64,
+
+    // -- Execution metrics --
+
+    /// Total wall-clock execution time in milliseconds.
+    pub execution_time_ms: f64,
+
+    /// Time spent in gate application (excluding allocation, measurement).
+    pub gate_application_time_ms: f64,
+
+    /// Time spent in measurement sampling.
+    pub measurement_time_ms: f64,
+
+    /// Peak memory consumption in bytes during simulation.
+    pub peak_memory_bytes: u64,
+
+    /// Memory allocated for the state vector / tensor network.
+    pub state_memory_bytes: u64,
+
+    /// Backend used for this simulation.
+    pub backend: BackendType,
+
+    // -- Throughput --
+
+    /// Gates applied per second (optimized gate count / gate application time).
+    pub gates_per_second: f64,
+
+    /// Qubits * depth per second (a normalized throughput metric).
+    pub quantum_volume_rate: f64,
+
+    // -- Optimization statistics --
+
+    /// Number of gates eliminated by fusion.
+    pub gates_fused: u64,
+
+    /// Number of gates eliminated as identity or redundant.
+    pub gates_skipped: u64,
+
+    /// Number of gate commutations applied.
+    pub gates_commuted: u64,
+
+    // -- Entanglement analysis --
+
+    /// Number of independent qubit subsets (entanglement groups).
+    pub entanglement_groups: u32,
+
+    /// Sizes of each entanglement group.
+    pub entanglement_group_sizes: Vec<u32>,
+
+    // -- Measurement outcomes (if measured) --
+
+    /// Number of measurement shots executed.
+    pub measurement_shots: Option<u64>,
+
+    /// Distribution entropy of measurement outcomes (bits).
+    pub outcome_entropy: Option<f64>,
+
+    // -- MPS-specific (tensor network backend) --
+
+    /// Maximum bond dimension reached (MPS mode only).
+    pub max_bond_dimension: Option<u32>,
+
+    /// Estimated fidelity after MPS truncation.
+    pub mps_fidelity_estimate: Option<f64>,
+
+    // -- Error information --
+
+    /// Whether the simulation completed successfully.
+    pub success: bool,
+
+    /// Error message if simulation failed.
+    pub error: Option<String>,
+
+    /// Error category for programmatic handling.
+    pub error_kind: Option<SimulationErrorKind>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum BackendType {
+    StateVector,
+    TensorNetwork,
+    Mps,
+    Hybrid,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum SimulationErrorKind {
+    QubitLimitExceeded,
+    MemoryAllocationFailed,
+    InvalidGateTarget,
+    InvalidParameter,
+    ContractionFailed,
+    MpsFidelityBelowThreshold,
+    Timeout,
+    InternalError,
+}
+```
+
+### 2. Metrics Sink Trait
+
+The engine publishes metrics through a trait abstraction, allowing different sinks
+for native and WASM environments:
+
+```rust
+/// Trait for consuming simulation metrics.
+/// Implementations exist for native (ruvector-metrics), WASM (JS callback),
+/// and testing (in-memory collector).
+pub trait MetricsSink: Send + Sync {
+    /// Publish a completed simulation's metrics.
+    fn publish(&self, metrics: &SimulationMetrics);
+
+    /// Publish an incremental progress update (for long-running simulations).
+    fn progress(&self, simulation_id: Uuid, percent_complete: f32, message: &str);
+
+    /// Publish a health status update.
+    fn health(&self, status: EngineHealthStatus);
+}
+
+/// Native implementation: forwards to ruvector-metrics.
+pub struct NativeMetricsSink {
+    registry: Arc<ruvector_metrics::Registry>,
+}
+
+impl MetricsSink for NativeMetricsSink {
+    fn publish(&self, metrics: &SimulationMetrics) {
+        // Emit as histogram/counter/gauge values
+        self.registry.histogram("ruqu.execution_time_ms")
+            .record(metrics.execution_time_ms);
+        self.registry.gauge("ruqu.peak_memory_bytes")
+            .set(metrics.peak_memory_bytes as f64);
+        self.registry.counter("ruqu.simulations_total")
+            .increment(1);
+        self.registry.counter("ruqu.gates_applied_total")
+            .increment(metrics.gate_count_optimized);
+        self.registry.histogram("ruqu.gates_per_second")
+            .record(metrics.gates_per_second);
+
+        if !metrics.success {
+            self.registry.counter("ruqu.errors_total")
+                .increment(1);
+        }
+    }
+
+    fn progress(&self, _id: Uuid, percent: f32, _msg: &str) {
+        self.registry.gauge("ruqu.current_progress")
+            .set(percent as f64);
+    }
+
+    fn health(&self, status: EngineHealthStatus) {
+        self.registry.gauge("ruqu.health_status")
+            .set(status.as_numeric());
+    }
+}
+```
+
+### 3. WASM Metrics Sink
+
+In WASM, metrics are delivered via JavaScript callbacks:
+
+```rust
+#[cfg(target_arch = "wasm32")]
+pub struct WasmMetricsSink {
+    /// JS callback function registered by host application.
+    callback: js_sys::Function,
+}
+
+#[cfg(target_arch = "wasm32")]
+impl MetricsSink for WasmMetricsSink {
+    fn publish(&self, metrics: &SimulationMetrics) {
+        let json = serde_json::to_string(metrics)
+            .unwrap_or_else(|_| "{}".to_string());
+        let js_value = JsValue::from_str(&json);
+        let event_type = JsValue::from_str("simulation_complete");
+        let _ = self.callback.call2(&JsValue::NULL, &event_type, &js_value);
+    }
+
+    fn progress(&self, id: Uuid, percent: f32, message: &str) {
+        let payload = format!(
+            r#"{{"simulation_id":"{}","percent":{},"message":"{}"}}"#,
+            id, percent, message
+        );
+        let js_value = JsValue::from_str(&payload);
+        let event_type = JsValue::from_str("simulation_progress");
+        let _ = self.callback.call2(&JsValue::NULL, &event_type, &js_value);
+    }
+
+    fn health(&self, status: EngineHealthStatus) {
+        let payload = format!(r#"{{"status":"{}"}}"#, status.as_str());
+        let js_value = JsValue::from_str(&payload);
+        let event_type = JsValue::from_str("engine_health");
+        let _ = self.callback.call2(&JsValue::NULL, &event_type, &js_value);
+    }
+}
+```
+
+JavaScript host registration:
+
+```javascript
+// Host application registers the metrics callback
+import init, { set_metrics_callback } from 'ruqu-wasm';
+
+await init();
+
+set_metrics_callback((eventType, data) => {
+    const metrics = JSON.parse(data);
+    switch (eventType) {
+        case 'simulation_complete':
+            console.log(`Simulation ${metrics.simulation_id} completed in ${metrics.execution_time_ms}ms`);
+            dashboard.updateMetrics(metrics);
+            break;
+        case 'simulation_progress':
+            progressBar.update(metrics.percent);
+            break;
+        case 'engine_health':
+            healthIndicator.set(metrics.status);
+            break;
+    }
+});
+```
+
+### 4. Tracing Integration
+
+The engine integrates with the Rust `tracing` crate for structured logging and
+distributed tracing.
+
+#### Span Hierarchy
+
+```
+ruqu::simulation                          (root span for entire simulation)
+  |
+  +-- ruqu::circuit_validation            (validate circuit structure)
+  |
+  +-- ruqu::backend_selection             (automatic backend choice)
+  |
+  +-- ruqu::optimization                  (gate fusion, commutation, etc.)
+  |     |
+  |     +-- ruqu::optimization::fusion    (individual fusion passes)
+  |     +-- ruqu::optimization::cancel    (gate cancellation)
+  |
+  +-- ruqu::state_init                    (allocate and initialize state)
+  |
+  +-- ruqu::gate_application              (apply all gates)
+  |     |
+  |     +-- ruqu::gate                    (individual gate -- DEBUG level only)
+  |
+  +-- ruqu::measurement                   (perform measurement sampling)
+  |
+  +-- ruqu::metrics_publish               (emit metrics to sink)
+  |
+  +-- ruqu::state_cleanup                 (deallocate state vector)
+```
+
+#### Instrumentation Code
+
+```rust
+use tracing::{info, warn, debug, trace, instrument, Span};
+
+#[instrument(
+    name = "ruqu::simulation",
+    skip(circuit, config, metrics_sink),
+    fields(
+        qubit_count = circuit.num_qubits(),
+        gate_count = circuit.gate_count(),
+        simulation_id = %Uuid::new_v4(),
+    )
+)]
+pub fn execute(
+    circuit: &QuantumCircuit,
+    shots: usize,
+    config: &SimulationConfig,
+    metrics_sink: &dyn MetricsSink,
+) -> Result<SimulationResult, SimulationError> {
+    info!(
+        qubits = circuit.num_qubits(),
+        gates = circuit.gate_count(),
+        depth = circuit.depth(),
+        shots = shots,
+        "Starting quantum simulation"
+    );
+
+    // Validate
+    let _validation_span = tracing::info_span!("ruqu::circuit_validation").entered();
+    validate_circuit(circuit)?;
+    drop(_validation_span);
+
+    // Select backend
+    let _backend_span = tracing::info_span!("ruqu::backend_selection").entered();
+    let backend = select_backend(circuit, config);
+    info!(backend = backend.name(), "Backend selected");
+    drop(_backend_span);
+
+    // Optimize
+    let _opt_span = tracing::info_span!("ruqu::optimization").entered();
+    let optimized = optimize_circuit(circuit, config)?;
+    info!(
+        original_gates = circuit.gate_count(),
+        optimized_gates = optimized.gate_count(),
+        gates_fused = circuit.gate_count() - optimized.gate_count(),
+        "Circuit optimization complete"
+    );
+    drop(_opt_span);
+
+    // Execute
+    let result = backend.execute(&optimized, shots, config)?;
+
+    // At DEBUG level, log per-gate details
+    debug!(
+        execution_time_ms = result.execution_time_ms,
+        peak_memory = result.peak_memory_bytes,
+        "Simulation execution complete"
+    );
+
+    // At TRACE level only for small circuits, log amplitude information
+    if circuit.num_qubits() <= 10 {
+        trace!(
+            amplitudes = ?result.state_vector_snapshot(),
+            "Final state vector (small circuit trace)"
+        );
+    }
+
+    Ok(result)
+}
+```
+
+### 5. Structured Error Reporting
+
+All errors carry structured context for programmatic handling:
+
+```rust
+#[derive(Debug, thiserror::Error)]
+pub enum SimulationError {
+    #[error("Qubit limit exceeded: requested {requested}, maximum {maximum}")]
+    QubitLimitExceeded {
+        requested: u32,
+        maximum: u32,
+        estimated_memory_bytes: u64,
+        available_memory_bytes: u64,
+    },
+
+    #[error("Memory allocation failed for {requested_bytes} bytes")]
+    MemoryAllocationFailed {
+        requested_bytes: u64,
+        qubit_count: u32,
+        suggestion: &'static str,
+    },
+
+    #[error("Invalid gate target: qubit {qubit} in {qubit_count}-qubit circuit")]
+    InvalidGateTarget {
+        gate_name: String,
+        qubit: u32,
+        qubit_count: u32,
+        gate_index: usize,
+    },
+
+    #[error("Invalid gate parameter: {parameter_name} = {value} ({reason})")]
+    InvalidParameter {
+        gate_name: String,
+        parameter_name: String,
+        value: f64,
+        reason: &'static str,
+    },
+
+    #[error("Tensor contraction failed: {reason}")]
+    ContractionFailed {
+        reason: String,
+        estimated_treewidth: usize,
+        suggestion: &'static str,
+    },
+
+    #[error("MPS fidelity {fidelity:.6} below threshold {threshold:.6}")]
+    MpsFidelityBelowThreshold {
+        fidelity: f64,
+        threshold: f64,
+        max_bond_dimension: usize,
+        suggestion: &'static str,
+    },
+
+    #[error("Simulation timed out after {elapsed_ms}ms (limit: {timeout_ms}ms)")]
+    Timeout {
+        elapsed_ms: u64,
+        timeout_ms: u64,
+        gates_completed: u64,
+        gates_remaining: u64,
+    },
+
+    #[error("Internal error: {message}")]
+    InternalError {
+        message: String,
+        source: Option<Box<dyn std::error::Error + Send + Sync>>,
+    },
+}
+```
+
+Each error variant includes a `suggestion` field where applicable, guiding users
+toward resolution:
+
+| Error | Suggestion |
+|---|---|
+| QubitLimitExceeded | "Reduce qubit count or enable tensor-network feature for large circuits" |
+| MemoryAllocationFailed | "Try tensor-network backend or reduce qubit count by 1-2 (halves/quarters memory)" |
+| ContractionFailed | "Circuit treewidth too high for tensor network; use state vector for <= 30 qubits" |
+| MpsFidelityBelowThreshold | "Increase chi_max or switch to exact state vector for high-fidelity results" |
+
+### 6. Health Checks
+
+The engine exposes health status for monitoring systems:
+
+```rust
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EngineHealthStatus {
+    /// Whether the engine is ready to accept simulations.
+    pub ready: bool,
+
+    /// Maximum qubits supportable given current available memory.
+    pub max_supported_qubits: u32,
+
+    /// Available memory in bytes.
+    pub available_memory_bytes: u64,
+
+    /// Number of CPU cores available for parallel gate application.
+    pub available_cores: usize,
+
+    /// Whether the tensor-network backend is compiled in.
+    pub tensor_network_available: bool,
+
+    /// Current engine version.
+    pub version: &'static str,
+
+    /// Uptime since engine initialization (if applicable).
+    pub uptime_seconds: Option<f64>,
+
+    /// Number of simulations executed in current session.
+    pub simulations_executed: u64,
+
+    /// Total gates applied across all simulations in current session.
+    pub total_gates_applied: u64,
+}
+
+/// Check engine health. Callable at any time.
+pub fn quantum_engine_ready() -> EngineHealthStatus {
+    let available_memory = estimate_available_memory();
+    let max_qubits = compute_max_qubits(available_memory);
+
+    EngineHealthStatus {
+        ready: max_qubits >= 4,  // Minimum useful simulation
+        max_supported_qubits: max_qubits,
+        available_memory_bytes: available_memory,
+        available_cores: rayon::current_num_threads(),
+        tensor_network_available: cfg!(feature = "tensor-network"),
+        version: env!("CARGO_PKG_VERSION"),
+        uptime_seconds: None,  // Library mode; no persistent uptime
+        simulations_executed: SESSION_COUNTER.load(Ordering::Relaxed),
+        total_gates_applied: SESSION_GATES.load(Ordering::Relaxed),
+    }
+}
+```
+
+### 7. Logging Levels
+
+| Level | Content | Audience | Performance Impact |
+|---|---|---|---|
+| ERROR | Simulation failures, OOM, invalid circuits | Operators, alerting | None |
+| WARN | Approaching memory limits (>80%), MPS fidelity degradation, slow contraction | Operators | Negligible |
+| INFO | Simulation start/end summaries, backend selection, optimization results | Developers, dashboards | Negligible |
+| DEBUG | Per-optimization-pass details, memory allocation sizes, thread utilization | Developers debugging | Low |
+| TRACE | Per-gate amplitude changes (small circuits only, n <= 10), SVD singular values | Deep debugging | High (small circuits only) |
+
+TRACE level is gated on circuit size to prevent catastrophic log volume:
+
+```rust
+// TRACE-level amplitude logging is only emitted for circuits with <= 10 qubits.
+// For larger circuits, TRACE only emits gate-level timing without amplitude data.
+if tracing::enabled!(tracing::Level::TRACE) {
+    if circuit.num_qubits() <= 10 {
+        trace!(amplitudes = ?state.as_slice(), "Post-gate state");
+    } else {
+        trace!(gate_time_ns = elapsed.as_nanos(), "Gate applied");
+    }
+}
+```
+
+### 8. Dashboard Integration
+
+Metrics from the quantum engine appear in the ruVector monitoring UI as a dedicated
+panel alongside vector operations, index health, and system resources.
+
+```
++------------------------------------------------------------------+
+|                    ruVector Monitoring Dashboard                   |
++------------------------------------------------------------------+
+|                                                                    |
+|  Vector Operations          |  Quantum Simulations                |
+|  -------------------        |  -----------------------            |
+|  Queries/sec: 12,450        |  Simulations/min: 23                |
+|  P99 latency: 2.3ms         |  Avg execution: 145ms               |
+|  Index size: 2.1M vectors   |  Avg qubits: 18.4                  |
+|                              |  Peak memory: 4.2 GiB              |
+|                              |  Backend: SV 87% / TN 13%         |
+|                              |  Gates/sec: 2.1B                   |
+|                              |  Error rate: 0.02%                 |
+|                              |                                    |
+|  System Resources           |  Recent Simulations                |
+|  -------------------        |  -----------------------            |
+|  CPU: 34%                   |  #a3f2.. 24q  230ms  OK           |
+|  Memory: 61% (49/80 GiB)   |  #b891.. 16q   12ms  OK           |
+|  Threads: 64/256 active     |  #c4d0.. 30q 1.2s   OK           |
+|                              |  #d122.. 35q  ERR   OOM          |
++------------------------------------------------------------------+
+```
+
+Metrics are published via the existing `ruvector-metrics` WebSocket feed:
+
+```json
+{
+    "source": "ruqu",
+    "type": "simulation_complete",
+    "timestamp": "2026-02-06T14:23:01.442Z",
+    "data": {
+        "simulation_id": "a3f2e891-...",
+        "qubit_count": 24,
+        "execution_time_ms": 230.4,
+        "peak_memory_bytes": 268435456,
+        "backend": "StateVector",
+        "gates_per_second": 2147483648,
+        "success": true
+    }
+}
+```
+
+### 9. Prometheus / OpenTelemetry Export
+
+For external monitoring, the native metrics sink exports standard Prometheus
+metrics:
+
+```
+# HELP ruqu_simulations_total Total quantum simulations executed
+# TYPE ruqu_simulations_total counter
+ruqu_simulations_total{backend="state_vector",status="success"} 1847
+ruqu_simulations_total{backend="state_vector",status="error"} 3
+ruqu_simulations_total{backend="tensor_network",status="success"} 241
+
+# HELP ruqu_execution_time_ms Simulation execution time histogram
+# TYPE ruqu_execution_time_ms histogram
+ruqu_execution_time_ms_bucket{backend="state_vector",le="10"} 423
+ruqu_execution_time_ms_bucket{backend="state_vector",le="100"} 1201
+ruqu_execution_time_ms_bucket{backend="state_vector",le="1000"} 1834
+ruqu_execution_time_ms_bucket{backend="state_vector",le="+Inf"} 1847
+
+# HELP ruqu_peak_memory_bytes Peak memory during simulation
+# TYPE ruqu_peak_memory_bytes gauge
+ruqu_peak_memory_bytes 4294967296
+
+# HELP ruqu_gates_per_second Gate application throughput
+# TYPE ruqu_gates_per_second gauge
+ruqu_gates_per_second 2.1e9
+
+# HELP ruqu_max_supported_qubits Maximum qubits based on available memory
+# TYPE ruqu_max_supported_qubits gauge
+ruqu_max_supported_qubits 33
+```
+
+## Consequences
+
+### Positive
+
+1. **Unified observability**: Quantum simulation telemetry integrates seamlessly
+   with ruVector's existing monitoring infrastructure.
+2. **Cross-platform**: The trait-based sink design supports native, WASM, and
+   testing environments without code changes in the engine.
+3. **Actionable errors**: Structured errors with suggestions reduce debugging time
+   and improve developer experience.
+4. **Performance visibility**: Gates-per-second, memory consumption, and backend
+   selection metrics enable informed performance tuning.
+5. **Compliance ready**: Structured logging with simulation IDs supports audit
+   trail requirements.
+
+### Negative
+
+1. **Metric cardinality**: High-frequency simulations could generate significant
+   metric volume. Mitigated by aggregation at the sink level.
+2. **WASM callback overhead**: JSON serialization for WASM metrics adds ~0.1ms per
+   simulation. Acceptable for typical workloads.
+3. **Tracing overhead at DEBUG/TRACE**: Enabled tracing at low levels adds
+   measurable overhead. Production deployments should use INFO or above.
+4. **Schema evolution**: Changes to `SimulationMetrics` require versioned handling
+   in consumers.
+
+### Risks and Mitigations
+
+| Risk | Mitigation |
+|---|---|
+| Metric volume overwhelming storage | Configurable sampling rate; aggregate in sink |
+| WASM callback exceptions | Catch JS exceptions in callback wrapper; log to console |
+| Schema breaking changes | Version field in metrics; consumer-side version dispatch |
+| TRACE logging for large circuits | Qubit-count gate prevents amplitude logging above n=10 |
+
+## References
+
+- `ruvector-metrics` crate: internal metrics infrastructure
+- Rust `tracing` crate: https://docs.rs/tracing
+- OpenTelemetry Rust SDK: https://docs.rs/opentelemetry
+- ADR-QE-005: WASM Compilation Target (WASM constraints)
+- ADR-QE-011: Memory Gating & Power Management (resource monitoring)
+- Prometheus exposition format: https://prometheus.io/docs/instrumenting/exposition_formats/
diff --git a/docs/adr/quantum-engine/ADR-QE-011-memory-gating-power-management.md b/docs/adr/quantum-engine/ADR-QE-011-memory-gating-power-management.md
new file mode 100644
index 00000000..bd44ac03
--- /dev/null
+++ b/docs/adr/quantum-engine/ADR-QE-011-memory-gating-power-management.md
@@ -0,0 +1,628 @@
+# ADR-QE-011: Memory Gating & Power Management
+
+**Status**: Proposed
+**Date**: 2026-02-06
+**Authors**: ruv.io, RuVector Team
+**Deciders**: Architecture Review Board
+
+---
+
+## Context
+
+ruVector is designed to operate within the Cognitum computing paradigm: a tile-based
+architecture with 256 low-power processor cores, event-driven activation, and
+aggressive power gating. Agents (software components) remain fully dormant until an
+event triggers their activation. Once their work completes, they release all
+resources and return to dormancy.
+
+The quantum simulation engine must adhere to this model:
+
+1. **Zero idle footprint**: When no simulation is running, the engine consumes zero
+   CPU cycles and zero heap memory beyond its compiled code and static data.
+2. **Rapid activation**: The engine must be ready to execute a simulation within
+   microseconds of receiving a request.
+3. **Prompt resource release**: Upon simulation completion (or failure), all
+   allocated memory is immediately freed.
+4. **Predictable memory**: Callers must be able to determine exact memory
+   requirements before committing to a simulation.
+
+### Memory Scale
+
+The state vector for n qubits requires 2^n complex amplitudes, each consuming 16
+bytes (two f64 values):
+
+| Qubits | Amplitudes | Memory | Notes |
+|--------|-----------|--------|-------|
+| 10 | 1,024 | 16 KiB | Trivial |
+| 15 | 32,768 | 512 KiB | Small |
+| 20 | 1,048,576 | 16 MiB | Moderate |
+| 25 | 33,554,432 | 512 MiB | Large |
+| 28 | 268,435,456 | 4 GiB | Needs dedicated memory |
+| 30 | 1,073,741,824 | 16 GiB | Workstation-class |
+| 32 | 4,294,967,296 | 64 GiB | Server-class |
+| 35 | 34,359,738,368 | 512 GiB | HPC |
+| 40 | 1,099,511,627,776 | 16 TiB | Infeasible (state vector) |
+
+Each additional qubit doubles memory. This exponential scaling makes memory the
+primary resource constraint and the most important resource to manage.
+
+### Edge and Embedded Constraints
+
+On edge devices (embedded ruVector nodes, IoT gateways, mobile processors), memory
+is severely limited:
+
+| Platform | Typical RAM | Max qubits (state vector) |
+|----------|------------|--------------------------|
+| Cognitum tile (single) | 256 MiB | 23 |
+| Cognitum tile cluster (4) | 1 GiB | 25 |
+| Raspberry Pi 4 | 8 GiB | 28 |
+| Mobile device | 4-6 GiB | 27-28 (with other apps) |
+| Laptop | 16-64 GiB | 29-31 |
+| Server | 256-512 GiB | 33-34 |
+
+### WASM Memory Model
+
+WebAssembly uses a linear memory that can grow but cannot shrink. Once a large
+simulation allocates pages, those pages remain mapped until the WASM instance is
+destroyed. This is a fundamental platform limitation that must be documented and
+accounted for.
+
+## Decision
+
+### 1. Zero-Idle Footprint Architecture
+
+The quantum engine is implemented as a pure library with no runtime overhead:
+
+```rust
+// The engine is a collection of functions and types.
+// No background threads, no event loops, no persistent state.
+// When not called, it consumes exactly zero CPU and zero heap.
+
+pub struct QuantumEngine;  // Zero-sized type; purely a namespace
+
+impl QuantumEngine {
+    /// Execute a simulation. All resources are allocated on entry
+    /// and freed on exit (or on error).
+    pub fn execute(
+        circuit: &QuantumCircuit,
+        shots: usize,
+        config: &SimulationConfig,
+    ) -> Result<SimulationResult, SimulationError> {
+        // 1. Estimate and validate memory
+        let required = Self::estimate_memory(circuit.num_qubits());
+        Self::validate_memory_available(required)?;
+
+        // 2. Allocate state vector (the big allocation)
+        let mut state = Self::allocate_state(circuit.num_qubits())?;
+
+        // 3. Execute gates (all computation happens here)
+        Self::apply_gates(circuit, &mut state, config)?;
+
+        // 4. Measure (if requested)
+        let measurements = Self::measure(&state, shots)?;
+
+        // 5. Build result (copies out what we need)
+        let result = SimulationResult::from_state_and_measurements(
+            &state, measurements, circuit,
+        );
+
+        // 6. state is dropped here -- Vec<Complex<f64>> deallocated
+        //    No cleanup needed. No finalizers. Just drop.
+
+        Ok(result)
+    }
+    // state goes out of scope and is deallocated by Rust's ownership system
+}
+```
+
+Key properties:
+- No `new()` or `init()` methods that create persistent state.
+- No `Drop` impl with complex cleanup logic.
+- No `Arc`, `Mutex`, or shared state between calls.
+- Each call is fully independent and self-contained.
+
+### 2. On-Demand Allocation Strategy
+
+State vectors are allocated at simulation start and freed at simulation end:
+
+```rust
+fn allocate_state(n_qubits: u32) -> Result<StateVector, SimulationError> {
+    let num_amplitudes = 1_usize.checked_shl(n_qubits)
+        .ok_or(SimulationError::QubitLimitExceeded {
+            requested: n_qubits,
+            maximum: (usize::BITS - 1) as u32,
+            estimated_memory_bytes: u64::MAX,
+            available_memory_bytes: estimate_available_memory() as u64,
+        })?;
+
+    let required_bytes = num_amplitudes
+        .checked_mul(std::mem::size_of::<Complex<f64>>())
+        .ok_or(SimulationError::MemoryAllocationFailed {
+            requested_bytes: u64::MAX,
+            qubit_count: n_qubits,
+            suggestion: "Qubit count exceeds addressable memory",
+        })?;
+
+    // Attempt allocation. Rust's global allocator will return an error
+    // (with #[global_allocator] configured) or the OS will OOM-kill us.
+    // We use try_reserve to handle this gracefully.
+    let mut amplitudes = Vec::new();
+    amplitudes.try_reserve_exact(num_amplitudes)
+        .map_err(|_| SimulationError::MemoryAllocationFailed {
+            requested_bytes: required_bytes as u64,
+            qubit_count: n_qubits,
+            suggestion: "Reduce qubit count or use tensor-network backend",
+        })?;
+
+    // Initialize to |00...0> state
+    amplitudes.resize(num_amplitudes, Complex::new(0.0, 0.0));
+    amplitudes[0] = Complex::new(1.0, 0.0);
+
+    Ok(StateVector { amplitudes, n_qubits })
+}
+```
+
+The allocation sequence:
+
+```
+  IDLE (zero memory)
+    |
+    v
+  estimate_memory(n) --> returns bytes needed
+    |
+    v
+  validate_memory_available(bytes) --> checks against OS/platform limits
+    |                                   returns Err if insufficient
+    v
+  Vec::try_reserve_exact(2^n) --> attempts allocation
+    |                              returns Err on failure (no panic)
+    v
+  ALLOCATED (2^n * 16 bytes on heap)
+    |
+    v
+  [... simulation runs ...]
+    |
+    v
+  Vec::drop() --> automatic deallocation
+    |
+    v
+  IDLE (zero memory)
+```
+
+### 3. Memory Estimation API
+
+Callers can query exact memory requirements before committing:
+
+```rust
+/// Returns the number of bytes required to simulate n_qubits.
+/// This accounts for the state vector plus working memory for
+/// gate application (temporary buffers, measurement arrays, etc.).
+///
+/// # Returns
+/// - `Ok(bytes)` if the qubit count is representable
+/// - `Err(...)` if 2^n_qubits overflows usize
+pub fn estimate_memory(n_qubits: u32) -> Result<MemoryEstimate, SimulationError> {
+    let num_amplitudes = 1_usize.checked_shl(n_qubits)
+        .ok_or(SimulationError::QubitLimitExceeded {
+            requested: n_qubits,
+            maximum: (usize::BITS - 1) as u32,
+            estimated_memory_bytes: u64::MAX,
+            available_memory_bytes: 0,
+        })?;
+
+    let state_vector_bytes = num_amplitudes * std::mem::size_of::<Complex<f64>>();
+
+    // Working memory: temporary buffer for gate application (1 amplitude slice)
+    // Plus measurement result storage
+    let working_bytes = num_amplitudes * std::mem::size_of::<Complex<f64>>() / 4;
+
+    // Thread-local scratch space (per Rayon thread)
+    let thread_count = rayon::current_num_threads();
+    let scratch_per_thread = 64 * 1024; // 64 KiB per thread for local buffers
+    let thread_scratch = thread_count * scratch_per_thread;
+
+    Ok(MemoryEstimate {
+        state_vector_bytes: state_vector_bytes as u64,
+        working_bytes: working_bytes as u64,
+        thread_scratch_bytes: thread_scratch as u64,
+        total_bytes: (state_vector_bytes + working_bytes + thread_scratch) as u64,
+        num_amplitudes: num_amplitudes as u64,
+    })
+}
+
+#[derive(Debug, Clone)]
+pub struct MemoryEstimate {
+    /// Bytes for the state vector (dominant cost).
+    pub state_vector_bytes: u64,
+    /// Bytes for gate-application working memory.
+    pub working_bytes: u64,
+    /// Bytes for thread-local scratch space.
+    pub thread_scratch_bytes: u64,
+    /// Total estimated bytes.
+    pub total_bytes: u64,
+    /// Number of complex amplitudes.
+    pub num_amplitudes: u64,
+}
+
+impl MemoryEstimate {
+    /// Returns true if the estimate fits within the given byte budget.
+    pub fn fits_in(&self, available_bytes: u64) -> bool {
+        self.total_bytes <= available_bytes
+    }
+
+    /// Suggest the maximum qubits for a given memory budget.
+    pub fn max_qubits_for(available_bytes: u64) -> u32 {
+        // Each qubit doubles memory; find largest n where 20 * 2^n <= available
+        // Factor of 20 accounts for 16-byte amplitudes + 25% working memory
+        let effective = available_bytes / 20;
+        if effective == 0 { return 0; }
+        (effective.ilog2()) as u32
+    }
+}
+```
+
+### 4. Allocation Failure Handling
+
+The engine never panics on allocation failure. All paths return structured errors:
+
+```rust
+// Pattern: every allocation is fallible and returns a descriptive error.
+
+// State vector allocation failure:
+SimulationError::MemoryAllocationFailed {
+    requested_bytes: 17_179_869_184,  // 16 GiB
+    qubit_count: 30,
+    suggestion: "Reduce qubit count by 2 (to 28, ~4 GiB) or enable tensor-network backend",
+}
+
+// Integer overflow (qubit count too large):
+SimulationError::QubitLimitExceeded {
+    requested: 64,
+    maximum: 33,  // based on available memory
+    estimated_memory_bytes: u64::MAX,
+    available_memory_bytes: 68_719_476_736,  // 64 GiB
+}
+```
+
+Decision tree on allocation failure:
+
+```
+  Memory allocation failed
+    |
+    +-- Is tensor-network feature enabled?
+    |     |
+    |     +-- YES: Suggest tensor-network backend
+    |     |         (may work if circuit has low treewidth)
+    |     |
+    |     +-- NO: Suggest reducing qubit count
+    |             Calculate: max_qubits = floor(log2(available / 20))
+    |             Suggest: "Reduce to {max_qubits} qubits ({memory} bytes)"
+    |
+    +-- Is the request wildly over budget (>100x)?
+    |     |
+    |     +-- YES: "Circuit requires {X} GiB but only {Y} MiB available"
+    |     |
+    |     +-- NO: "Circuit requires {X} GiB, {Y} GiB available.
+    |              Reducing by {delta} qubits would fit."
+    |
+    +-- Return SimulationError (no panic, no abort)
+```
+
+### 5. CPU Yielding for Long Simulations
+
+For simulations estimated to exceed 100ms, the engine can optionally yield between
+gate batches to allow the OS scheduler to manage power states:
+
+```rust
+pub struct YieldConfig {
+    /// Enable cooperative yielding between gate batches.
+    /// Default: false (maximum throughput).
+    pub enabled: bool,
+
+    /// Number of gates to apply before yielding.
+    /// Default: 1000.
+    pub gates_per_slice: usize,
+
+    /// Yield mechanism.
+    /// Default: ThreadYield (std::thread::yield_now).
+    pub yield_strategy: YieldStrategy,
+}
+
+pub enum YieldStrategy {
+    /// Call std::thread::yield_now() between slices.
+    ThreadYield,
+    /// Sleep for specified duration between slices.
+    Sleep(Duration),
+    /// Call a user-provided callback between slices.
+    Callback(Box<dyn Fn(SliceProgress) + Send>),
+}
+
+pub struct SliceProgress {
+    pub gates_completed: u64,
+    pub gates_remaining: u64,
+    pub elapsed: Duration,
+    pub estimated_remaining: Duration,
+}
+
+// Usage in gate application loop:
+fn apply_gates_with_yield(
+    circuit: &QuantumCircuit,
+    state: &mut StateVector,
+    yield_config: &YieldConfig,
+) -> Result<(), SimulationError> {
+    let gates = circuit.gates();
+
+    for (i, gate) in gates.iter().enumerate() {
+        apply_single_gate(gate, state)?;
+
+        if yield_config.enabled && (i + 1) % yield_config.gates_per_slice == 0 {
+            match &yield_config.yield_strategy {
+                YieldStrategy::ThreadYield => std::thread::yield_now(),
+                YieldStrategy::Sleep(d) => std::thread::sleep(*d),
+                YieldStrategy::Callback(cb) => cb(SliceProgress {
+                    gates_completed: (i + 1) as u64,
+                    gates_remaining: (gates.len() - i - 1) as u64,
+                    elapsed: start.elapsed(),
+                    estimated_remaining: estimate_remaining(i, gates.len(), start),
+                }),
+            }
+        }
+    }
+
+    Ok(())
+}
+```
+
+Yield is **disabled by default** to maximize throughput. It is primarily intended
+for:
+- Edge devices where power management is critical.
+- Interactive applications where UI responsiveness matters.
+- Long-running simulations (>1 second) where progress reporting is needed.
+
+### 6. Thread Management
+
+The quantum engine does not create or manage its own threads:
+
+```
+  +-----------------------------------------------+
+  |              Global Rayon Thread Pool          |
+  |  (shared by all ruVector subsystems)          |
+  |                                                |
+  |  [Thread 0] [Thread 1] ... [Thread N-1]       |
+  |     ^           ^              ^               |
+  |     |           |              |               |
+  |  +--+---+   +--+---+      +---+--+            |
+  |  | ruQu |   | ruQu |      | idle |            |
+  |  | gate  |   | gate |      |      |            |
+  |  | apply |   | apply|      |      |            |
+  |  +-------+   +------+      +------+            |
+  |                                                |
+  |  During simulation: threads work on gates      |
+  |  After simulation: threads return to pool      |
+  |  Pool idle: OS can power-gate cores            |
+  +-----------------------------------------------+
+```
+
+Key properties:
+- Rayon's global thread pool is initialized once by `ruvector-core` at startup.
+- The quantum engine calls `rayon::par_iter()` and related APIs, borrowing threads
+  temporarily.
+- When simulation completes, all threads are returned to the global pool.
+- If no ruVector work is pending, Rayon threads park (blocking on a condvar),
+  consuming zero CPU. The OS can then power-gate the underlying cores.
+
+### 7. WASM Memory Considerations
+
+WebAssembly linear memory has a specific behavior that affects resource management:
+
+```
+  WASM Memory Layout
+  +------------------+------------------+
+  |  Initial pages   |  Grown pages     |
+  |  (compiled size) |  (runtime alloc) |
+  +------------------+------------------+
+  0                  initial_size       current_size
+
+  Growth: memory.grow(delta_pages) -> adds pages to the end
+  Shrink: NOT SUPPORTED in WASM spec
+
+  After 25-qubit simulation:
+  +------------------+----------------------------------+
+  |  Initial (1 MiB) |  Grown for state vec (512 MiB)  |  <- HIGH WATER MARK
+  +------------------+----------------------------------+
+
+  After simulation completes:
+  +------------------+----------------------------------+
+  |  Initial (1 MiB) |  FREED internally but pages      |
+  |                   |  still mapped (512 MiB virtual)  |
+  +------------------+----------------------------------+
+  The Rust allocator returns memory to its free list,
+  but WASM pages are not returned to the host.
+```
+
+**Implications and mitigations**:
+
+1. **Document the behavior**: Users must understand that WASM memory is a high-water
+   mark. A 25-qubit simulation permanently increases the WASM instance's memory
+   footprint to ~512 MiB.
+
+2. **Instance recycling**: For applications that run multiple simulations, create a
+   new WASM instance periodically to reset the memory high-water mark.
+
+3. **Memory budget enforcement**: The WASM host can set `WebAssembly.Memory` with a
+   `maximum` parameter to cap growth:
+
+```javascript
+const memory = new WebAssembly.Memory({
+    initial: 16,      // 1 MiB
+    maximum: 8192,     // 512 MiB cap
+});
+```
+
+4. **Pre-check in WASM**: The engine's `estimate_memory()` function works in WASM
+   and should be called before simulation to verify the allocation will succeed.
+
+### 8. Cognitum Tile Integration
+
+On Cognitum's tile-based architecture, the quantum engine maps to tiles as follows:
+
+```
+  Cognitum Processor (256 tiles)
+  +--------+--------+--------+--------+
+  | Tile 0 | Tile 1 | Tile 2 | Tile 3 |  <- Assigned to quantum sim
+  | ACTIVE | ACTIVE | ACTIVE | ACTIVE |
+  +--------+--------+--------+--------+
+  | Tile 4 | Tile 5 | Tile 6 | Tile 7 |  <- Other ruVector work (or sleeping)
+  | sleep  | vecDB  | sleep  | sleep  |
+  +--------+--------+--------+--------+
+  |  ...   |  ...   |  ...   |  ...   |
+  | sleep  | sleep  | sleep  | sleep  |  <- Power gated (zero consumption)
+  +--------+--------+--------+--------+
+```
+
+**Power state diagram for a quantum simulation lifecycle**:
+
+```
+  State: ALL_TILES_IDLE
+    |
+    | Simulation request arrives
+    v
+  State: ALLOCATING
+    Action: Wake tiles 0-3 (or however many are needed)
+    Action: Allocate state vector across tile-local memory
+    Power: Tiles 0-3 ACTIVE, rest SLEEP
+    |
+    v
+  State: SIMULATING
+    Action: Apply gates in parallel across active tiles
+    Power: Tiles 0-3 at full clock rate
+    Duration: microseconds to seconds depending on circuit
+    |
+    v
+  State: MEASURING
+    Action: Sample measurement outcomes
+    Power: Tile 0 only (measurement is sequential)
+    |
+    v
+  State: DEALLOCATING
+    Action: Free state vector
+    Action: Return tiles to idle pool
+    |
+    v
+  State: ALL_TILES_IDLE
+    Power: Tiles 0-3 back to SLEEP
+    Memory: Zero heap allocation
+```
+
+**Tile assignment policy**:
+- Small simulations (n <= 20): 1 tile sufficient.
+- Medium simulations (20 < n <= 25): 2-4 tiles for parallel gate application.
+- Large simulations (25 < n <= 30): All available tiles.
+- The tile scheduler (part of Cognitum runtime) handles assignment. The quantum
+  engine simply uses Rayon parallelism; the runtime maps Rayon threads to tiles.
+
+### 9. Memory Budget Table
+
+Quick reference for capacity planning:
+
+| Qubits | State Vector | Working Memory | Total | Platform Fit |
+|--------|-------------|---------------|-------|-------------|
+| 10 | 16 KiB | 4 KiB | 20 KiB | Any |
+| 12 | 64 KiB | 16 KiB | 80 KiB | Any |
+| 14 | 256 KiB | 64 KiB | 320 KiB | Any |
+| 16 | 1 MiB | 256 KiB | 1.3 MiB | Any |
+| 18 | 4 MiB | 1 MiB | 5 MiB | Any |
+| 20 | 16 MiB | 4 MiB | 20 MiB | Any |
+| 22 | 64 MiB | 16 MiB | 80 MiB | Cognitum single tile |
+| 24 | 256 MiB | 64 MiB | 320 MiB | Cognitum 2+ tiles |
+| 26 | 1 GiB | 256 MiB | 1.3 GiB | Cognitum cluster |
+| 28 | 4 GiB | 1 GiB | 5 GiB | Laptop / RPi 8GB |
+| 30 | 16 GiB | 4 GiB | 20 GiB | Workstation |
+| 32 | 64 GiB | 16 GiB | 80 GiB | Server |
+| 34 | 256 GiB | 64 GiB | 320 GiB | Large server |
+
+### 10. Allocation and Deallocation Sequence Diagram
+
+```
+  Caller                Engine                  OS/Allocator
+    |                     |                         |
+    |  execute(circuit)   |                         |
+    |-------------------->|                         |
+    |                     |                         |
+    |                     |  estimate_memory(n)     |
+    |                     |  validate_available()   |
+    |                     |                         |
+    |                     |  try_reserve_exact(2^n) |
+    |                     |------------------------>|
+    |                     |                         |
+    |                     |     Ok(ptr) or Err      |
+    |                     |<------------------------|
+    |                     |                         |
+    |                     |  [if Err: return        |
+    |                     |   SimulationError]      |
+    |                     |                         |
+    |                     |  initialize |00...0>    |
+    |                     |  apply gates            |
+    |                     |  measure                |
+    |                     |                         |
+    |                     |  build result           |
+    |                     |  (copies measurements,  |
+    |                     |   expectation values)   |
+    |                     |                         |
+    |                     |  drop(state_vector)     |
+    |                     |------------------------>|
+    |                     |                         |  free(ptr, 2^n * 16)
+    |                     |                         |
+    |  Ok(result)         |                         |
+    |<--------------------|                         |
+    |                     |                         |
+    |  [Engine holds ZERO |                         |
+    |   heap memory now]  |                         |
+```
+
+## Consequences
+
+### Positive
+
+1. **True zero-idle cost**: No background resource consumption. Perfectly aligned
+   with Cognitum's event-driven architecture and power gating.
+2. **Predictable memory**: `estimate_memory()` gives exact requirements before
+   committing, preventing OOM surprises.
+3. **Graceful degradation**: Allocation failures return structured errors with
+   actionable suggestions, never panics.
+4. **Platform portable**: The same allocation strategy works on native (Linux, macOS,
+   Windows), WASM, and embedded (Cognitum tiles).
+5. **No resource leaks**: Rust's ownership system guarantees deallocation on all
+   exit paths (success, error, panic).
+
+### Negative
+
+1. **No state caching**: Each simulation allocates and deallocates independently.
+   Repeated simulations on the same qubit count pay allocation cost each time.
+   Mitigation: allocation is O(2^n) but fast compared to O(G * 2^n) simulation.
+2. **WASM memory high-water mark**: Cannot reclaim WASM linear memory pages.
+   Documented as a platform limitation with instance-recycling workaround.
+3. **No memory pooling**: Could theoretically amortize allocation across simulations,
+   but this conflicts with the zero-idle-footprint requirement.
+4. **Yield overhead**: When enabled, cooperative yielding adds per-slice overhead.
+   Mitigated by making it opt-in and configurable.
+
+### Risks and Mitigations
+
+| Risk | Likelihood | Impact | Mitigation |
+|---|---|---|---|
+| OOM despite estimate_memory check | Low | Crash | Check returns conservative estimate including working memory |
+| WASM instance runs out of address space | Medium | Failure | Set `WebAssembly.Memory` maximum; document limitation |
+| Allocation latency spike (OS page faults) | Medium | Slow start | Consider `madvise` / `mlock` hints for large allocations |
+| Rayon thread pool contention | Medium | Degraded perf | Quantum engine yields between slices; Rayon work-stealing handles contention |
+
+## References
+
+- Cognitum Architecture Specification: event-driven tile-based computing
+- Rust `Vec::try_reserve_exact`: https://doc.rust-lang.org/std/vec/struct.Vec.html#method.try_reserve_exact
+- WebAssembly Memory: https://webassembly.github.io/spec/core/syntax/modules.html#memories
+- Rayon thread pool: https://docs.rs/rayon
+- ADR-QE-001: Core Engine Architecture (zero-overhead design principle)
+- ADR-QE-005: WASM Compilation Target (WASM constraints)
+- ADR-QE-009: Tensor Network Evaluation Mode (alternative for large circuits)
+- ADR-QE-010: Observability & Monitoring (memory metrics reporting)
diff --git a/docs/adr/quantum-engine/ADR-QE-012-mincut-coherence-integration.md b/docs/adr/quantum-engine/ADR-QE-012-mincut-coherence-integration.md
new file mode 100644
index 00000000..ad81cd8e
--- /dev/null
+++ b/docs/adr/quantum-engine/ADR-QE-012-mincut-coherence-integration.md
@@ -0,0 +1,876 @@
+# ADR-QE-012: Min-Cut Coherence Integration
+
+**Status**: Proposed
+**Date**: 2026-02-06
+**Authors**: ruv.io, RuVector Team
+**Deciders**: Architecture Review Board
+
+---
+
+## Context
+
+The ruVector ecosystem contains several components that must work together for
+quantum error correction (QEC) simulation:
+
+1. **ruQu (existing)**: A real-time coherence gating system that performs
+   boundary-to-boundary min-cut analysis on surface code error patterns. It includes
+   a three-filter syndrome pipeline (Structural | Shift | Evidence), a Minimum Weight
+   Perfect Matching (MWPM) decoder, and an early warning system that predicts
+   correlated failures 100+ cycles ahead.
+
+2. **ruvector-mincut (existing)**: A graph partitioning crate that computes minimum
+   cuts and balanced partitions. Currently used for vector index sharding but
+   directly applicable to syndrome graph decomposition.
+
+3. **Coherence Engine (ADR-014)**: Computes coherence energy via sheaf Laplacian
+   analysis. The "mincut-gated-transformer" concept uses coherence energy to skip
+   computation on "healthy" regions, achieving up to 50% FLOPs reduction.
+
+4. **Quantum Simulation Engine (new, ADR-QE-001 through ADR-QE-011)**: The
+   state-vector and tensor-network simulator being designed in this ADR series.
+
+The challenge is integrating these components into a coherent (pun intended)
+pipeline where simulated quantum circuits produce syndromes, those syndromes are
+decoded in real-time, and coherence analysis feeds back into simulation parameters.
+
+### Surface Code Background
+
+A distance-d surface code encodes 1 logical qubit in d^2 data qubits + (d^2 - 1)
+ancilla qubits:
+
+| Distance | Data qubits | Ancilla qubits | Total qubits | Error threshold |
+|----------|------------|----------------|--------------|----------------|
+| 3 | 9 | 8 | 17 | ~1% |
+| 5 | 25 | 24 | 49 | ~1% |
+| 7 | 49 | 48 | 97 | ~1% |
+| 9 | 81 | 80 | 161 | ~1% |
+| 11 | 121 | 120 | 241 | ~1% |
+
+Syndrome extraction involves measuring ancilla qubits each cycle. The measurement
+outcomes (syndromes) indicate where errors may have occurred. The decoder's job is
+to determine the most likely error pattern from the syndrome and apply corrections.
+
+### Performance Requirements
+
+ruQu's existing decoder targets P99 latency of <4 microseconds for syndrome
+decoding. The integrated simulation + decode pipeline must meet:
+
+| Operation | Target latency | Notes |
+|-----------|---------------|-------|
+| Single syndrome decode | <4 us | Existing ruQu target (MWPM) |
+| Syndrome extraction sim | <5 ms | One round of ancilla measurement |
+| Full cycle (sim + decode) | <10 ms | Distance-3, single error cycle |
+| Full cycle (sim + decode) | <50 ms | Distance-5 |
+| Full cycle (sim + decode) | <200 ms | Distance-7 (tensor network) |
+| Early warning evaluation | <1 ms | Check predicted vs actual syndromes |
+
+## Decision
+
+### 1. Architecture Overview
+
+The integration follows a pipeline architecture where data flows from quantum
+simulation through syndrome extraction, filtering, decoding, and coherence analysis:
+
+```
++------------------------------------------------------------------+
+|                  Quantum Error Correction Pipeline                 |
++------------------------------------------------------------------+
+|                                                                    |
+|  +------------------+     +---------------------+                  |
+|  | Quantum Circuit  |     | Error Model         |                  |
+|  | (surface code    |---->| (depolarizing,      |                  |
+|  |  syndrome        |     |  biased noise,      |                  |
+|  |  extraction)     |     |  correlated)        |                  |
+|  +------------------+     +---------------------+                  |
+|           |                        |                               |
+|           v                        v                               |
+|  +--------------------------------------------+                   |
+|  | Quantum Simulation Engine                   |                   |
+|  | (state vector or tensor network)            |                   |
+|  | - Simulates noisy syndrome extraction       |                   |
+|  | - Outputs ancilla measurement outcomes      |                   |
+|  +--------------------------------------------+                   |
+|           |                                                        |
+|           | syndrome bitstring                                     |
+|           v                                                        |
+|  +--------------------------------------------+                   |
+|  | SyndromeFilter (ruQu)                       |                   |
+|  | Filter 1: Structural (lattice geometry)     |                   |
+|  | Filter 2: Shift (temporal correlations)     |                   |
+|  | Filter 3: Evidence (statistical weight)     |                   |
+|  +--------------------------------------------+                   |
+|           |                                                        |
+|           | filtered syndrome                                      |
+|           v                                                        |
+|  +--------------------------------------------+                   |
+|  | MWPM Decoder (ruQu)                         |                   |
+|  | - Minimum Weight Perfect Matching           |                   |
+|  | - Returns Pauli correction operators        |                   |
+|  | - Target: <4 us P99 latency                 |                   |
+|  +--------------------------------------------+                   |
+|           |                                                        |
+|           | correction operators (X, Z Paulis)                     |
+|           v                                                        |
+|  +--------------------------------------------+                   |
+|  | Correction Application                      |                   |
+|  | - Apply Pauli gates to simulated state      |                   |
+|  | - Verify logical qubit integrity            |                   |
+|  +--------------------------------------------+                   |
+|           |                                                        |
+|           | corrected state                                        |
+|           v                                                        |
+|  +-----------------------+    +-------------------------+          |
+|  | Coherence Engine      |    | Early Warning System    |          |
+|  | (sheaf Laplacian)     |    | (100+ cycle prediction) |          |
+|  | - Compute coherence   |<-->| - Correlate historical  |          |
+|  |   energy              |    |   syndromes             |          |
+|  | - Gate simulation     |    | - Predict failures      |          |
+|  |   FLOPs if healthy    |    | - Feed back to sim      |          |
+|  +-----------------------+    +-------------------------+          |
+|           |                            |                           |
+|           v                            v                           |
+|  +--------------------------------------------+                   |
+|  | Cryptographic Audit Trail                   |                   |
+|  | - Ed25519 signed decisions                  |                   |
+|  | - Blake3 hash chains                        |                   |
+|  | - Every syndrome, decode, correction logged |                   |
+|  +--------------------------------------------+                   |
+|                                                                    |
++------------------------------------------------------------------+
+```
+
+### 2. Syndrome-to-Decoder Bridge
+
+The quantum simulation engine outputs raw measurement bitstrings. These are
+converted to the syndrome format expected by ruQu's decoder:
+
+```rust
+/// Bridge between quantum simulation output and ruQu decoder input.
+pub struct SyndromeBridge;
+
+impl SyndromeBridge {
+    /// Convert simulation measurement outcomes to ruQu syndrome format.
+    ///
+    /// The simulation measures ancilla qubits. A detection event occurs
+    /// when an ancilla measurement differs from the previous round
+    /// (or from the expected value in the first round).
+    pub fn extract_syndrome(
+        measurements: &MeasurementOutcome,
+        code: &SurfaceCodeLayout,
+        previous_round: Option<&SyndromeRound>,
+    ) -> SyndromeRound {
+        let mut detections = Vec::new();
+
+        for ancilla in code.ancilla_qubits() {
+            let current = measurements.get(ancilla.index());
+            let previous = previous_round
+                .map(|r| r.get(ancilla.id()))
+                .unwrap_or(0);  // Expected value in first round
+
+            if current != previous {
+                detections.push(Detection {
+                    ancilla_id: ancilla.id(),
+                    ancilla_type: ancilla.stabilizer_type(),  // X or Z
+                    position: ancilla.lattice_position(),
+                    round: measurements.round_number(),
+                });
+            }
+        }
+
+        SyndromeRound {
+            round: measurements.round_number(),
+            detections,
+            raw_measurements: measurements.ancilla_bits().to_vec(),
+        }
+    }
+
+    /// Apply decoder corrections back to the simulation state.
+    pub fn apply_corrections(
+        state: &mut StateVector,
+        corrections: &DecoderCorrection,
+        code: &SurfaceCodeLayout,
+    ) {
+        for (qubit_id, pauli) in &corrections.operations {
+            let qubit_index = code.data_qubit_index(*qubit_id);
+            match pauli {
+                Pauli::X => state.apply_x(qubit_index),
+                Pauli::Z => state.apply_z(qubit_index),
+                Pauli::Y => {
+                    state.apply_x(qubit_index);
+                    state.apply_z(qubit_index);
+                }
+                Pauli::I => {}  // No correction needed
+            }
+        }
+    }
+}
+```
+
+### 3. SyndromeFilter Pipeline (ruQu Integration)
+
+The three-filter pipeline processes raw syndromes before decoding:
+
+```rust
+/// ruQu's three-stage syndrome filtering pipeline.
+pub struct SyndromeFilterPipeline {
+    structural: StructuralFilter,
+    shift: ShiftFilter,
+    evidence: EvidenceFilter,
+}
+
+impl SyndromeFilterPipeline {
+    /// Process a syndrome round through all three filters.
+    pub fn filter(&mut self, syndrome: SyndromeRound) -> FilteredSyndrome {
+        // Filter 1: Structural
+        // Removes detections inconsistent with lattice geometry.
+        // E.g., isolated detections with no nearby partner.
+        let after_structural = self.structural.apply(&syndrome);
+
+        // Filter 2: Shift
+        // Accounts for temporal correlations between rounds.
+        // Detections that appear and disappear in consecutive rounds
+        // may be measurement errors (not data errors).
+        let after_shift = self.shift.apply(&after_structural);
+
+        // Filter 3: Evidence
+        // Weights remaining detections by statistical evidence.
+        // Uses error model probabilities to assign confidence scores.
+        let after_evidence = self.evidence.apply(&after_shift);
+
+        after_evidence
+    }
+}
+```
+
+### 4. MWPM Decoder Integration
+
+The filtered syndrome feeds into ruQu's MWPM decoder:
+
+```rust
+/// Interface to ruQu's Minimum Weight Perfect Matching decoder.
+pub trait SyndromeDecoder {
+    /// Decode a filtered syndrome into correction operations.
+    /// Target: <4 microseconds P99 latency.
+    fn decode(
+        &self,
+        syndrome: &FilteredSyndrome,
+        code: &SurfaceCodeLayout,
+    ) -> DecoderCorrection;
+
+    /// Decode with timing information for performance monitoring.
+    fn decode_timed(
+        &self,
+        syndrome: &FilteredSyndrome,
+        code: &SurfaceCodeLayout,
+    ) -> (DecoderCorrection, DecoderTiming);
+}
+
+pub struct DecoderCorrection {
+    /// Pauli corrections to apply to data qubits.
+    pub operations: Vec<(QubitId, Pauli)>,
+
+    /// Confidence score (0.0 = no confidence, 1.0 = certain).
+    pub confidence: f64,
+
+    /// Whether a logical error was detected (correction may be wrong).
+    pub logical_error_detected: bool,
+
+    /// Matching weight (lower is more likely).
+    pub matching_weight: f64,
+}
+
+pub struct DecoderTiming {
+    /// Total decode time.
+    pub total_ns: u64,
+
+    /// Time spent building the matching graph.
+    pub graph_construction_ns: u64,
+
+    /// Time spent in the MWPM algorithm.
+    pub matching_ns: u64,
+
+    /// Number of detection events in the input.
+    pub num_detections: usize,
+}
+```
+
+### 5. Min-Cut Graph Partitioning for Parallel Decoding
+
+For large surface codes (distance >= 7), the syndrome graph can be partitioned
+using `ruvector-mincut` for parallel decoding:
+
+```rust
+use ruvector_mincut::{partition, PartitionConfig, WeightedGraph};
+
+/// Partition the syndrome graph for parallel decoding.
+/// This exploits spatial locality in the surface code: errors in
+/// distant regions can be decoded independently.
+pub fn parallel_decode(
+    syndrome: &FilteredSyndrome,
+    code: &SurfaceCodeLayout,
+    decoder: &dyn SyndromeDecoder,
+) -> DecoderCorrection {
+    // Build the detection graph (nodes = detections, edges = possible errors)
+    let detection_graph = build_detection_graph(syndrome, code);
+
+    // If small enough, decode directly
+    if detection_graph.num_nodes() <= 20 {
+        return decoder.decode(syndrome, code);
+    }
+
+    // Partition the detection graph using ruvector-mincut
+    let config = PartitionConfig {
+        num_partitions: estimate_partition_count(&detection_graph),
+        balance_factor: 1.2,
+        minimize: Objective::EdgeCut,
+    };
+    let partitions = partition(&detection_graph, &config);
+
+    // Decode each partition independently (in parallel via Rayon)
+    let partial_corrections: Vec<DecoderCorrection> = partitions
+        .par_iter()
+        .map(|partition| {
+            let sub_syndrome = syndrome.restrict_to(partition);
+            decoder.decode(&sub_syndrome, code)
+        })
+        .collect();
+
+    // Handle boundary edges (detections that span partitions)
+    let boundary_correction = decode_boundary_edges(
+        syndrome, code, &partitions, decoder,
+    );
+
+    // Merge all corrections
+    merge_corrections(partial_corrections, boundary_correction)
+}
+
+/// Estimate optimal partition count based on detection density.
+fn estimate_partition_count(graph: &WeightedGraph) -> usize {
+    let n = graph.num_nodes();
+    if n <= 20 { 1 }
+    else if n <= 50 { 2 }
+    else if n <= 100 { 4 }
+    else { (n / 25).min(rayon::current_num_threads()) }
+}
+```
+
+This matches ruQu's existing boundary-to-boundary min-cut analysis: the partition
+boundaries correspond to the cuts in the syndrome graph where independent decoding
+regions meet.
+
+### 6. Coherence Gating for Simulation FLOPs Reduction
+
+The sheaf Laplacian coherence energy (from ADR-014) provides a measure of how
+"healthy" a quantum state region is. High coherence energy means the region is
+behaving as expected (low error rate). This enables a novel optimization:
+
+```
+  Coherence Gating Decision Tree
+  ================================
+
+  For each region R of the surface code:
+
+    1. Compute coherence energy E(R) via sheaf Laplacian
+
+    2. Compare to thresholds:
+
+       E(R) > E_high (0.95)
+         |
+         +-- Region is HEALTHY
+         |   Action: SKIP detailed simulation for this region
+         |   Use: simplified noise model (Pauli channel approximation)
+         |   Savings: ~50% FLOPs for this region
+         |
+       E_low (0.70) < E(R) <= E_high (0.95)
+         |
+         +-- Region is NOMINAL
+         |   Action: STANDARD simulation
+         |   Use: full gate-by-gate simulation with noise
+         |   Savings: none
+         |
+       E(R) <= E_low (0.70)
+         |
+         +-- Region is DEGRADED
+         |   Action: ENHANCED simulation
+         |   Use: full simulation + additional diagnostics
+         |   Extra: log detailed error patterns, trigger early warning
+         |   Savings: negative (more work, but necessary)
+```
+
+Implementation:
+
+```rust
+/// Coherence-gated simulation mode.
+/// Uses coherence energy to decide simulation fidelity per region.
+pub struct CoherenceGatedSimulator {
+    /// Full-fidelity simulator for nominal/degraded regions.
+    full_simulator: Box<dyn SimulationBackend>,
+
+    /// Simplified simulator for healthy regions.
+    simplified_simulator: SimplifiedNoiseModel,
+
+    /// Coherence engine for computing region health.
+    coherence_engine: CoherenceEngine,
+
+    /// Thresholds for gating decisions.
+    high_threshold: f64,
+    low_threshold: f64,
+}
+
+impl CoherenceGatedSimulator {
+    /// Simulate one QEC cycle with coherence gating.
+    pub fn simulate_cycle(
+        &mut self,
+        state: &mut StateVector,
+        code: &SurfaceCodeLayout,
+        error_model: &ErrorModel,
+        history: &SyndromeHistory,
+    ) -> CycleResult {
+        // Step 1: Compute coherence energy per region
+        let regions = code.spatial_regions();
+        let coherence = self.coherence_engine.compute_regional(
+            history, &regions,
+        );
+
+        // Step 2: Classify regions and simulate accordingly
+        let mut cycle_syndromes = Vec::new();
+        let mut flops_saved = 0_u64;
+        let mut flops_total = 0_u64;
+
+        for (region, energy) in regions.iter().zip(coherence.energies()) {
+            let region_qubits = code.qubits_in_region(region);
+
+            if *energy > self.high_threshold {
+                // HEALTHY: Use simplified Pauli noise model
+                let syndrome = self.simplified_simulator.simulate_region(
+                    state, &region_qubits, error_model,
+                );
+                let full_cost = estimate_full_sim_cost(&region_qubits);
+                let simplified_cost = estimate_simplified_cost(&region_qubits);
+                flops_saved += full_cost - simplified_cost;
+                flops_total += simplified_cost;
+                cycle_syndromes.push(syndrome);
+
+            } else if *energy > self.low_threshold {
+                // NOMINAL: Full simulation
+                let syndrome = self.full_simulator.simulate_region(
+                    state, &region_qubits, error_model,
+                );
+                let cost = estimate_full_sim_cost(&region_qubits);
+                flops_total += cost;
+                cycle_syndromes.push(syndrome);
+
+            } else {
+                // DEGRADED: Full simulation + diagnostics
+                let syndrome = self.full_simulator.simulate_region_with_diagnostics(
+                    state, &region_qubits, error_model,
+                );
+                let cost = estimate_full_sim_cost(&region_qubits) * 12 / 10;
+                flops_total += cost;
+                cycle_syndromes.push(syndrome);
+
+                // Trigger early warning system
+                tracing::warn!(
+                    region = %region.id(),
+                    coherence_energy = energy,
+                    "Degraded coherence detected; enhanced monitoring active"
+                );
+            }
+        }
+
+        CycleResult {
+            syndromes: merge_region_syndromes(cycle_syndromes),
+            flops_saved,
+            flops_total,
+            coherence_energies: coherence,
+        }
+    }
+}
+```
+
+### 7. Cryptographic Audit Trail
+
+All syndrome decisions are signed and chained for tamper-evident logging, following
+the existing ruQu pattern:
+
+```rust
+use ed25519_dalek::{SigningKey, Signature, Signer};
+use blake3::Hasher;
+
+/// Cryptographically auditable decision record.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct AuditRecord {
+    /// Sequence number in the audit chain.
+    pub sequence: u64,
+
+    /// Blake3 hash of the previous record (chain linkage).
+    pub previous_hash: [u8; 32],
+
+    /// Timestamp (nanosecond precision).
+    pub timestamp_ns: u128,
+
+    /// The decision being recorded.
+    pub decision: AuditableDecision,
+
+    /// Ed25519 signature over (sequence || previous_hash || timestamp || decision).
+    pub signature: Signature,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub enum AuditableDecision {
+    /// Raw syndrome from simulation.
+    SyndromeExtracted {
+        round: u64,
+        detections: Vec<Detection>,
+        simulation_id: Uuid,
+    },
+
+    /// Filtered syndrome after pipeline.
+    SyndromeFiltered {
+        round: u64,
+        detections_before: usize,
+        detections_after: usize,
+        filters_applied: Vec<String>,
+    },
+
+    /// Decoder correction decision.
+    CorrectionApplied {
+        round: u64,
+        corrections: Vec<(QubitId, Pauli)>,
+        confidence: f64,
+        decode_time_ns: u64,
+    },
+
+    /// Coherence gating decision.
+    CoherenceGating {
+        round: u64,
+        region_id: String,
+        coherence_energy: f64,
+        decision: GatingDecision,
+        flops_saved: u64,
+    },
+
+    /// Early warning alert.
+    EarlyWarning {
+        round: u64,
+        predicted_failure_round: u64,
+        confidence: f64,
+        affected_region: String,
+    },
+
+    /// Logical error detected.
+    LogicalError {
+        round: u64,
+        error_type: String,
+        decoder_confidence: f64,
+    },
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub enum GatingDecision {
+    SkipDetailedSimulation,
+    StandardSimulation,
+    EnhancedSimulation,
+}
+
+/// Audit trail manager.
+pub struct AuditTrail {
+    signing_key: SigningKey,
+    chain_head: [u8; 32],
+    sequence: u64,
+}
+
+impl AuditTrail {
+    /// Record a decision in the audit trail.
+    pub fn record(&mut self, decision: AuditableDecision) -> AuditRecord {
+        let timestamp_ns = std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .unwrap()
+            .as_nanos();
+
+        // Compute hash of the decision content
+        let mut hasher = Hasher::new();
+        hasher.update(&self.sequence.to_le_bytes());
+        hasher.update(&self.chain_head);
+        hasher.update(&timestamp_ns.to_le_bytes());
+        hasher.update(&bincode::serialize(&decision).unwrap());
+        let content_hash = hasher.finalize();
+
+        // Sign the hash
+        let signature = self.signing_key.sign(content_hash.as_bytes());
+
+        let record = AuditRecord {
+            sequence: self.sequence,
+            previous_hash: self.chain_head,
+            timestamp_ns,
+            decision,
+            signature,
+        };
+
+        // Update chain
+        self.chain_head = *content_hash.as_bytes();
+        self.sequence += 1;
+
+        record
+    }
+}
+```
+
+### 8. Early Warning Feedback Loop
+
+ruQu's early warning system predicts correlated failures 100+ cycles ahead. This
+prediction feeds back into the simulation engine to validate decoder robustness:
+
+```rust
+/// Early warning integration with quantum simulation.
+pub struct EarlyWarningIntegration {
+    warning_system: EarlyWarningSystem,
+    error_injector: ErrorInjector,
+}
+
+impl EarlyWarningIntegration {
+    /// Check early warning predictions and optionally inject
+    /// targeted errors to validate decoder response.
+    pub fn process_cycle(
+        &mut self,
+        history: &SyndromeHistory,
+        state: &mut StateVector,
+        code: &SurfaceCodeLayout,
+    ) -> Vec<EarlyWarningAction> {
+        let predictions = self.warning_system.predict(history);
+        let mut actions = Vec::new();
+
+        for prediction in &predictions {
+            if prediction.confidence > 0.8 {
+                // High-confidence prediction: inject targeted errors
+                // to validate that the decoder handles this failure mode
+                let targeted_errors = self.error_injector.generate_targeted(
+                    &prediction.affected_region,
+                    &prediction.predicted_error_pattern,
+                    code,
+                );
+
+                actions.push(EarlyWarningAction::InjectTargetedErrors {
+                    region: prediction.affected_region.clone(),
+                    errors: targeted_errors,
+                    prediction_confidence: prediction.confidence,
+                    predicted_failure_round: prediction.failure_round,
+                });
+
+                tracing::info!(
+                    confidence = prediction.confidence,
+                    failure_round = prediction.failure_round,
+                    region = %prediction.affected_region,
+                    "Early warning: injecting targeted errors for decoder validation"
+                );
+            } else if prediction.confidence > 0.5 {
+                // Moderate confidence: increase monitoring, do not inject
+                actions.push(EarlyWarningAction::IncreasedMonitoring {
+                    region: prediction.affected_region.clone(),
+                    enhanced_diagnostics: true,
+                });
+            }
+        }
+
+        actions
+    }
+}
+
+pub enum EarlyWarningAction {
+    /// Inject targeted errors to test decoder response.
+    InjectTargetedErrors {
+        region: String,
+        errors: Vec<InjectedError>,
+        prediction_confidence: f64,
+        predicted_failure_round: u64,
+    },
+    /// Increase monitoring without error injection.
+    IncreasedMonitoring {
+        region: String,
+        enhanced_diagnostics: bool,
+    },
+}
+```
+
+### 9. Performance Targets
+
+| Pipeline stage | Target latency | Distance-3 | Distance-5 | Distance-7 |
+|---|---|---|---|---|
+| Syndrome extraction (sim) | Varies | 2 ms | 15 ms | 80 ms |
+| Syndrome filtering | <0.5 ms | 0.1 ms | 0.2 ms | 0.4 ms |
+| MWPM decoding | <4 us | 1 us | 2 us | 3.5 us |
+| Correction application | <0.1 ms | 0.01 ms | 0.05 ms | 0.08 ms |
+| Coherence computation | <1 ms | 0.3 ms | 0.5 ms | 0.8 ms |
+| Audit record creation | <0.05 ms | 0.02 ms | 0.03 ms | 0.04 ms |
+| **Total cycle** | | **~3 ms** | **~16 ms** | **~82 ms** |
+
+For distance-7 and above, the tensor network backend (ADR-QE-009) is used for
+the syndrome extraction simulation, as 97 qubits exceeds state-vector capacity.
+
+### 10. Integration Data Flow Summary
+
+```
+  +-------------------+
+  | QuantumCircuit    |   Surface code syndrome extraction circuit
+  | (parameterized by |   with noise model applied
+  |  error model)     |
+  +--------+----------+
+           |
+           v
+  +--------+----------+
+  | SimulationEngine  |   State vector (d<=5) or tensor network (d>=7)
+  | execute()         |
+  +--------+----------+
+           |
+           | MeasurementOutcome (ancilla bitstring)
+           v
+  +--------+----------+
+  | SyndromeBridge    |   Convert measurements to detection events
+  | extract_syndrome()|
+  +--------+----------+
+           |
+           | SyndromeRound
+           v
+  +--------+----------+
+  | SyndromeFilter    |   Three-stage filtering (Structural|Shift|Evidence)
+  | Pipeline          |
+  +--------+----------+
+           |
+           | FilteredSyndrome
+           v
+  +--------+----------+     +------------------+
+  | MWPM Decoder      |<--->| ruvector-mincut  |  Parallel decoding
+  | (ruQu)            |     | graph partition  |  for large codes
+  +--------+----------+     +------------------+
+           |
+           | DecoderCorrection (Pauli operators)
+           v
+  +--------+----------+
+  | Correction Apply  |   Apply X/Z/Y Paulis to simulated state
+  +--------+----------+
+           |
+           | Corrected state
+           v
+  +--------+--+------+-----+---+
+  |           |              |  |
+  v           v              v  v
+  Coherence   Early Warning  Audit Trail
+  Engine      System         (Ed25519 +
+  (sheaf      (100+ cycle    Blake3)
+  Laplacian)  prediction)
+  |           |
+  |           +---> Feeds back to simulation
+  |                 (targeted error injection)
+  |
+  +---> Coherence gating
+        (skip/standard/enhanced sim)
+        ~50% FLOPs reduction when healthy
+```
+
+### 11. API Surface
+
+The complete integration is exposed through a high-level API:
+
+```rust
+/// High-level QEC simulation with full pipeline integration.
+pub struct QecSimulator {
+    engine: QuantumEngine,
+    bridge: SyndromeBridge,
+    filter: SyndromeFilterPipeline,
+    decoder: Box<dyn SyndromeDecoder>,
+    coherence: Option<CoherenceGatedSimulator>,
+    early_warning: Option<EarlyWarningIntegration>,
+    audit: AuditTrail,
+    history: SyndromeHistory,
+}
+
+impl QecSimulator {
+    /// Run N cycles of QEC simulation.
+    pub fn run_cycles(
+        &mut self,
+        code: &SurfaceCodeLayout,
+        error_model: &ErrorModel,
+        num_cycles: usize,
+    ) -> QecSimulationResult {
+        let mut results = Vec::with_capacity(num_cycles);
+
+        for cycle in 0..num_cycles {
+            let cycle_result = self.run_single_cycle(code, error_model, cycle);
+            results.push(cycle_result);
+        }
+
+        QecSimulationResult {
+            cycles: results,
+            logical_error_rate: self.compute_logical_error_rate(&results),
+            total_flops_saved: results.iter().map(|r| r.flops_saved).sum(),
+            decoder_latency_p99: self.compute_decoder_p99(&results),
+        }
+    }
+
+    fn run_single_cycle(
+        &mut self,
+        code: &SurfaceCodeLayout,
+        error_model: &ErrorModel,
+        cycle: usize,
+    ) -> CycleResult {
+        // ... full pipeline as described above
+    }
+}
+```
+
+## Consequences
+
+### Positive
+
+1. **Unified pipeline**: Simulation, decoding, coherence analysis, and auditing
+   work together seamlessly rather than as disconnected tools.
+2. **Real performance gains**: Coherence gating can reduce simulation FLOPs by
+   ~50% for healthy regions, directly applicable to long QEC simulations.
+3. **Decoder validation**: The simulation engine provides a controlled environment
+   to test decoder correctness under various error models.
+4. **Early warning validation**: Predicted failures can be injected and the decoder's
+   response verified, increasing confidence in the early warning system.
+5. **Auditable**: Every decision in the pipeline is cryptographically signed and
+   hash-chained, meeting compliance requirements for safety-critical applications.
+6. **Leverages existing infrastructure**: `ruvector-mincut`, ruQu's decoder, and
+   the coherence engine are reused rather than reimplemented.
+
+### Negative
+
+1. **Coupling**: The integration creates dependencies between previously independent
+   crates. Changes to ruQu's syndrome format require updates to the bridge.
+   Mitigation: trait abstractions at integration boundaries.
+2. **Complexity**: The full pipeline has many stages, each with its own configuration
+   and failure modes. Mitigation: sensible defaults and the high-level `QecSimulator`
+   API that hides complexity.
+3. **Performance overhead**: Coherence computation and audit trail signing add
+   latency to each cycle. Mitigation: both are optional and can be disabled.
+4. **Tensor network dependency**: Distance >= 7 codes require the tensor network
+   backend, which is behind a feature flag and may not always be compiled in.
+
+### Risks and Mitigations
+
+| Risk | Likelihood | Impact | Mitigation |
+|---|---|---|---|
+| Coherence gating skips a region that has real errors | Low | Missed errors | Conservative thresholds; periodic full-fidelity verification cycles |
+| MWPM decoder exceeds 4us on partitioned syndrome | Medium | Latency violation | Adaptive partition count; fallback to non-partitioned decode |
+| Early warning false positives cause unnecessary error injection | Medium | Wasted cycles | Confidence threshold (>0.8) gates injection; injection is rate-limited |
+| Audit trail storage grows unboundedly | Medium | Disk exhaustion | Configurable retention; periodic pruning of old records |
+| Syndrome format version mismatch between sim and decoder | Low | Decode failure | Version field in SyndromeRound; compatibility checks at pipeline init |
+
+## References
+
+- ruQu crate: boundary-to-boundary min-cut coherence gating
+- ruQu SyndromeFilter: three-filter pipeline (Structural | Shift | Evidence)
+- `ruvector-mincut` crate: graph partitioning for parallel decoding
+- ADR-014: Coherence Engine (sheaf Laplacian coherence computation)
+- ADR-CE-001: Sheaf Laplacian (mathematical foundation)
+- ADR-QE-001: Core Engine Architecture (simulation backends)
+- ADR-QE-009: Tensor Network Evaluation Mode (large code simulation)
+- ADR-QE-010: Observability & Monitoring (metrics for pipeline stages)
+- ADR-QE-011: Memory Gating & Power Management (resource constraints)
+- Fowler et al., "Surface codes: Towards practical large-scale quantum computation" (2012)
+- Higgott, "PyMatching: A Python package for decoding quantum codes with MWPM" (2022)
+- Dennis et al., "Topological quantum memory" (2002) -- MWPM decoding
+- Ed25519: https://ed25519.cr.yp.to/
+- Blake3: https://github.com/BLAKE3-team/BLAKE3
diff --git a/docs/architecture/quantum-engine/quantum-engine-ddd-integration.md b/docs/architecture/quantum-engine/quantum-engine-ddd-integration.md
new file mode 100644
index 00000000..40647b0b
--- /dev/null
+++ b/docs/architecture/quantum-engine/quantum-engine-ddd-integration.md
@@ -0,0 +1,816 @@
+# Quantum Simulation Engine: Domain-Driven Design - Integration Patterns
+
+**Version**: 0.1
+**Date**: 2026-02-06
+**Status**: Draft
+
+---
+
+## Overview
+
+This document defines the cross-domain integration patterns, anti-corruption layers, shared kernel, and context mapping that connect the quantum simulation engine (`ruqu-core`, `ruqu-algorithms`, `ruqu-wasm`) to the existing ruVector subsystems. It specifies how the simulation domain communicates with the coherence engine, agent system, graph database, and WASM platform without contaminating bounded context boundaries.
+
+---
+
+## Context Map
+
+```
++-------------------------------------------------------------------+
+|                         CONTEXT MAP                                |
+|                                                                    |
+|  +--------------------+     Shared Kernel     +------------------+ |
+|  |                    |<----(ruvector-math)--->|                  | |
+|  |  Quantum Sim       |                       |  Coherence       | |
+|  |  Engine             |                       |  Engine          | |
+|  |  (ruqu-core,        |    Anti-Corruption    |  (ruvector-      | |
+|  |   ruqu-algorithms)  |<----(CoherenceBridge) |   coherence)     | |
+|  |                    |                       |                  | |
+|  +--------+-----------+                       +------------------+ |
+|           |                                          ^             |
+|           | Customer-Supplier                        |             |
+|           v                                          |             |
+|  +--------------------+                    +---------+--------+   |
+|  |                    |    Partnership     |                  |   |
+|  |  Agent System      |<----------------->|  Graph Database  |   |
+|  |  (claude-flow)     |                   |  (ruvector-graph)|   |
+|  |                    |                   |                  |   |
+|  +--------------------+                   +------------------+   |
+|           |                                                       |
+|           | Conformist                                            |
+|           v                                                       |
+|  +--------------------+     Published Language                    |
+|  |                    |<----(OpenQASM 3.0)                       |
+|  |  WASM Platform     |                                          |
+|  |  (ruqu-wasm)       |                                          |
+|  |                    |                                          |
+|  +--------------------+                                          |
++-------------------------------------------------------------------+
+```
+
+### Relationship Summary
+
+| Upstream | Downstream | Pattern | Shared Artifact |
+|----------|------------|---------|-----------------|
+| Quantum Engine | Coherence Engine | Anti-Corruption Layer | `CoherenceBridge` trait |
+| ruvector-math | Quantum Engine, Coherence Engine | Shared Kernel | `Complex<f64>`, SIMD traits |
+| Quantum Engine | Agent System | Customer-Supplier | `SimulationContract` |
+| ruvector-graph | Quantum Engine | Partnership | Adjacency structures |
+| External tools | Quantum Engine | Published Language | OpenQASM 3.0 |
+| WASM platform | ruqu-wasm | Conformist | WASM constraints accepted |
+
+---
+
+## 1. Anti-Corruption Layer: Coherence Bridge
+
+The Coherence Bridge translates between the quantum simulation domain language and the ruQu coherence domain. It prevents internal types from either domain from leaking into the other.
+
+### Purpose
+
+- Map syndrome bitstrings produced by surface code experiments into the `SyndromeFilter` input format expected by the coherence engine
+- Map decoder correction outputs (Pauli operators) to gate operations the simulation can apply
+- Translate coherence scores into the `CoherenceScore` value object used by simulation sessions
+- Isolate the quantum simulation engine from changes in the coherence engine's internal API
+
+### Interface
+
+```rust
+/// Anti-corruption layer between quantum simulation and coherence engine.
+///
+/// All translation between bounded contexts passes through this trait.
+/// Neither domain's internal types appear on the wrong side of this boundary.
+pub trait CoherenceBridge: Send + Sync {
+    /// Translate a quantum syndrome into a coherence engine filter input.
+    ///
+    /// The simulation produces `SyndromeBits`; the coherence engine expects
+    /// `DetectorBitmap` with specific tile routing. This method handles the
+    /// mapping, including stabilizer-to-detector index translation.
+    fn syndrome_to_filter_input(
+        &self,
+        syndrome: &SyndromeBits,
+        code_distance: u32,
+    ) -> Result<CoherenceFilterInput, BridgeError>;
+
+    /// Translate a coherence decoder correction into Pauli gate operations.
+    ///
+    /// The coherence engine's decoder outputs correction vectors in its own
+    /// format. This method maps them to `PauliOp` sequences that the
+    /// simulation engine can apply as gate operations.
+    fn correction_to_pauli_ops(
+        &self,
+        correction: &CoherenceCorrectionOutput,
+    ) -> Result<Vec<(QubitIndex, PauliOp)>, BridgeError>;
+
+    /// Query the current coherence score for a simulation region.
+    ///
+    /// Returns a domain-native `CoherenceScore` value object, hiding
+    /// the coherence engine's internal energy representation.
+    fn query_coherence_score(
+        &self,
+        region_id: &str,
+    ) -> Result<CoherenceScore, BridgeError>;
+
+    /// Submit simulation metrics to the coherence monitoring system.
+    ///
+    /// Translates `SimulationMetrics` into the coherence engine's
+    /// signal ingestion format without exposing internal types.
+    fn report_simulation_metrics(
+        &self,
+        session_id: &str,
+        metrics: &SimulationMetrics,
+    ) -> Result<(), BridgeError>;
+}
+
+/// Opaque input type for the coherence filter (ACL boundary type).
+pub struct CoherenceFilterInput {
+    pub detector_bitmap: Vec<u64>,
+    pub tile_id: u8,
+    pub round_id: u64,
+}
+
+/// Opaque output type from the coherence decoder (ACL boundary type).
+pub struct CoherenceCorrectionOutput {
+    pub corrections: Vec<(u32, u8)>,  // (qubit_index, pauli_code)
+    pub confidence: f64,
+}
+
+/// Errors specific to the bridge translation layer.
+#[derive(Debug, thiserror::Error)]
+pub enum BridgeError {
+    #[error("syndrome dimension mismatch: expected {expected}, got {actual}")]
+    SyndromeDimensionMismatch { expected: usize, actual: usize },
+
+    #[error("unknown correction code: {0}")]
+    UnknownCorrectionCode(u8),
+
+    #[error("coherence engine unavailable: {0}")]
+    CoherenceUnavailable(String),
+
+    #[error("tile routing failed for code distance {0}")]
+    TileRoutingFailed(u32),
+}
+```
+
+### Implementation Sketch
+
+```rust
+/// Production implementation backed by the ruQu coherence engine.
+pub struct RuQuCoherenceBridge {
+    /// Reference to the coherence engine's filter pipeline.
+    filter_pipeline: Arc<dyn FilterPipelineAccess>,
+    /// Stabilizer-to-detector mapping, precomputed per code distance.
+    detector_maps: HashMap<u32, StabilizerDetectorMap>,
+}
+
+impl CoherenceBridge for RuQuCoherenceBridge {
+    fn syndrome_to_filter_input(
+        &self,
+        syndrome: &SyndromeBits,
+        code_distance: u32,
+    ) -> Result<CoherenceFilterInput, BridgeError> {
+        let map = self.detector_maps.get(&code_distance)
+            .ok_or(BridgeError::TileRoutingFailed(code_distance))?;
+
+        let mut bitmap = vec![0u64; (map.detector_count + 63) / 64];
+        for (stab_idx, &fired) in syndrome.0.iter().enumerate() {
+            if fired {
+                let det_idx = map.stabilizer_to_detector(stab_idx);
+                bitmap[det_idx / 64] |= 1u64 << (det_idx % 64);
+            }
+        }
+
+        Ok(CoherenceFilterInput {
+            detector_bitmap: bitmap,
+            tile_id: map.tile_for_distance(code_distance),
+            round_id: 0, // Filled by caller
+        })
+    }
+
+    fn correction_to_pauli_ops(
+        &self,
+        correction: &CoherenceCorrectionOutput,
+    ) -> Result<Vec<(QubitIndex, PauliOp)>, BridgeError> {
+        correction.corrections.iter()
+            .map(|(qubit, code)| {
+                let op = match code {
+                    0 => PauliOp::I,
+                    1 => PauliOp::X,
+                    2 => PauliOp::Y,
+                    3 => PauliOp::Z,
+                    other => return Err(BridgeError::UnknownCorrectionCode(*other)),
+                };
+                Ok((QubitIndex(*qubit), op))
+            })
+            .collect()
+    }
+
+    fn query_coherence_score(
+        &self,
+        region_id: &str,
+    ) -> Result<CoherenceScore, BridgeError> {
+        let energy = self.filter_pipeline.current_energy(region_id)
+            .map_err(|e| BridgeError::CoherenceUnavailable(e.to_string()))?;
+        // Invert: high energy = low coherence
+        Ok(CoherenceScore(1.0 / (1.0 + energy as f64)))
+    }
+
+    fn report_simulation_metrics(
+        &self,
+        _session_id: &str,
+        _metrics: &SimulationMetrics,
+    ) -> Result<(), BridgeError> {
+        // Translate to coherence signal format and submit
+        Ok(())
+    }
+}
+```
+
+---
+
+## 2. Shared Kernel: ruvector-math
+
+Both the quantum simulation engine and the coherence engine depend on a shared mathematical foundation. Changes to `ruvector-math` must be validated against both domains before release.
+
+### Shared Types
+
+```rust
+// ruvector-math provides these types used by both domains:
+
+/// Complex number with f64 components (re, im).
+/// Used by quantum state vectors AND coherence restriction maps.
+pub struct Complex<T> {
+    pub re: T,
+    pub im: T,
+}
+
+/// Cache-line-aligned vector for SIMD operations.
+/// Used by both state vector operations and residual computation.
+#[repr(align(64))]
+pub struct AlignedVec<T> {
+    data: Vec<T>,
+}
+
+/// SIMD dispatch trait: implementations select AVX2, NEON, or scalar
+/// at runtime depending on platform capabilities.
+pub trait SimdOps {
+    fn dot_product_f64(a: &[f64], b: &[f64]) -> f64;
+    fn complex_multiply(a: &[Complex<f64>], b: &[Complex<f64>], out: &mut [Complex<f64>]);
+    fn norm_squared(v: &[Complex<f64>]) -> f64;
+    fn axpy(alpha: f64, x: &[f64], y: &mut [f64]);
+}
+```
+
+### Change Coordination Protocol
+
+1. Any proposed change to `ruvector-math` must include tests for both the quantum engine use case and the coherence engine use case.
+2. The CI pipeline runs `cargo test -p ruqu-core` and `cargo test -p ruvector-coherence` after any change to `ruvector-math`.
+3. Breaking changes require a version bump and simultaneous updates to both downstream crates.
+4. Performance regressions in SIMD operations must be caught by benchmarks in both domains.
+
+### Boundary
+
+Only the types and functions listed above cross the shared kernel boundary. Internal implementation details of `ruvector-math` (e.g., specific SIMD intrinsics, platform detection) are not shared.
+
+---
+
+## 3. Customer-Supplier: Agent System Integration
+
+The ruVector agent system (powered by claude-flow) acts as the customer, invoking the quantum simulation engine as a supplier. The contract defines what the agent can request and what it receives in return.
+
+### Contract
+
+```rust
+/// Contract for agent system access to the quantum simulation engine.
+///
+/// The agent system (customer) invokes these operations.
+/// The quantum engine (supplier) fulfills them.
+pub trait SimulationContract: Send + Sync {
+    /// Build a circuit from a high-level description.
+    fn build_circuit(&self, spec: CircuitSpec) -> Result<CircuitHandle, ContractError>;
+
+    /// Run a simulation and return results.
+    fn run_simulation(&self, circuit: CircuitHandle, config: RunConfig)
+        -> Result<SimulationOutput, ContractError>;
+
+    /// Run a VQE optimization and return the ground state energy.
+    fn run_vqe(&self, spec: VQESpec) -> Result<VQEOutput, ContractError>;
+
+    /// Query resource requirements before committing to a run.
+    fn estimate_resources(&self, circuit: CircuitHandle) -> Result<ResourceEstimate, ContractError>;
+}
+
+/// High-level circuit specification from the agent.
+pub struct CircuitSpec {
+    pub qubit_count: u32,
+    pub gate_sequence: Vec<GateSpec>,
+    pub parameters: HashMap<String, f64>,
+}
+
+/// Agent-facing gate specification (simplified from internal Gate).
+pub struct GateSpec {
+    pub gate_type: String,
+    pub target: u32,
+    pub control: Option<u32>,
+    pub angle: Option<f64>,
+}
+
+/// Configuration limits the agent can set.
+pub struct RunConfig {
+    pub max_shots: u32,
+    pub max_memory_mb: u32,
+    pub timeout_seconds: u32,
+    pub backend_preference: Option<String>,
+}
+
+/// Results returned to the agent.
+pub struct SimulationOutput {
+    pub measurement_counts: HashMap<String, u32>,
+    pub expectation_values: Vec<(String, f64)>,
+    pub metrics: SimulationMetrics,
+}
+
+/// VQE-specific results.
+pub struct VQEOutput {
+    pub ground_state_energy: f64,
+    pub optimal_parameters: Vec<f64>,
+    pub iterations: u32,
+    pub converged: bool,
+}
+
+/// Resource estimate before execution.
+pub struct ResourceEstimate {
+    pub memory_bytes: usize,
+    pub estimated_time_ms: f64,
+    pub qubit_count: u32,
+    pub gate_count: u32,
+}
+```
+
+### Agent Integration Flow
+
+```
+Agent Context         Quantum Engine            Result
+    |                      |                      |
+    | 1. build_circuit()   |                      |
+    |--------------------->|                      |
+    |   CircuitHandle      |                      |
+    |<---------------------|                      |
+    |                      |                      |
+    | 2. estimate_resources|                      |
+    |--------------------->|                      |
+    |   ResourceEstimate   |                      |
+    |<---------------------|                      |
+    |                      |                      |
+    | 3. run_simulation()  |                      |
+    |--------------------->|                      |
+    |                      | [executes internally]|
+    |                      |---+                  |
+    |                      |   | circuit -> state |
+    |                      |   | gates -> measure |
+    |                      |<--+                  |
+    |   SimulationOutput   |                      |
+    |<---------------------|                      |
+    |                      |                      |
+    | 4. Agent acts on     |                      |
+    |    results           |                      |
+    v                      v                      v
+```
+
+### Resource Limits
+
+The supplier enforces resource limits set by the customer:
+
+- Memory: Capped at `max_memory_mb`; returns error if state vector exceeds budget
+- Time: Monitored per-step; simulation aborted if `timeout_seconds` exceeded
+- Qubits: Platform limit (30 for state vector, higher for tensor network) communicated via `estimate_resources`
+
+---
+
+## 4. Published Language: OpenQASM Compatibility
+
+A future integration point for importing and exporting circuits in the OpenQASM 3.0 standard, enabling interoperability with IBM Qiskit, Google Cirq, and other quantum frameworks.
+
+### Translation Layer
+
+```rust
+/// Trait for OpenQASM import/export.
+pub trait OpenQASMTranslator {
+    /// Parse an OpenQASM 3.0 string into the internal circuit representation.
+    fn import(&self, qasm: &str) -> Result<QuantumCircuit, TranslationError>;
+
+    /// Export an internal circuit to OpenQASM 3.0 format.
+    fn export(&self, circuit: &QuantumCircuit) -> Result<String, TranslationError>;
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum TranslationError {
+    #[error("unsupported gate in OpenQASM: {0}")]
+    UnsupportedGate(String),
+
+    #[error("parse error at line {line}: {message}")]
+    ParseError { line: u32, message: String },
+
+    #[error("circuit uses features not supported by OpenQASM 3.0: {0}")]
+    UnsupportedFeature(String),
+}
+```
+
+### Scope
+
+- Phase 1: Import basic gate circuits (H, CNOT, Rz, measure)
+- Phase 2: Export circuits with parameter bindings
+- Phase 3: Support custom gate definitions and classical control flow
+
+---
+
+## 5. Conformist: WASM Platform
+
+The `ruqu-wasm` crate conforms to WASM platform constraints without attempting to work around them. Limitations are accepted as-is, with graceful degradation where capabilities are reduced.
+
+### Accepted Constraints
+
+| Constraint | Impact | Mitigation |
+|------------|--------|------------|
+| No native threads | Single-threaded execution | Sequential gate application; no rayon |
+| 4GB memory limit | Max ~25 qubits (state vector) | Tensor network backend for larger circuits |
+| No filesystem | Cannot persist results | Return all data via JS callbacks |
+| No system clock | Timing metrics unavailable | Use `performance.now()` via JS bridge |
+| No SIMD (some runtimes) | Slower math | Feature-gated SIMD; scalar fallback |
+
+### WASM API Surface
+
+```rust
+/// Public API exposed to JavaScript via wasm-bindgen.
+///
+/// This is the conformist boundary: we accept WASM constraints
+/// and expose only what the platform allows.
+#[cfg(target_arch = "wasm32")]
+pub mod wasm_api {
+    use wasm_bindgen::prelude::*;
+
+    #[wasm_bindgen]
+    pub struct WasmSimulator {
+        session: SimulationSession,
+    }
+
+    #[wasm_bindgen]
+    impl WasmSimulator {
+        /// Create a new simulator for the given qubit count.
+        #[wasm_bindgen(constructor)]
+        pub fn new(qubit_count: u32) -> Result<WasmSimulator, JsValue> {
+            // Enforce WASM-specific qubit limit
+            if qubit_count > 25 {
+                return Err(JsValue::from_str(
+                    "WASM platform supports at most 25 qubits in state vector mode"
+                ));
+            }
+            // ... construction
+            Ok(WasmSimulator { session: todo!() })
+        }
+
+        /// Add a gate to the circuit.
+        pub fn add_gate(&mut self, gate_type: &str, target: u32, control: Option<u32>)
+            -> Result<(), JsValue> { Ok(()) }
+
+        /// Run the simulation and return measurement counts as JSON.
+        pub fn run(&mut self, shots: u32) -> Result<String, JsValue> {
+            Ok("{}".to_string())
+        }
+
+        /// Get memory usage estimate in bytes.
+        pub fn memory_estimate(&self) -> usize { 0 }
+    }
+}
+```
+
+---
+
+## 6. Partnership: Graph Database Integration
+
+The `ruvector-graph` crate and the quantum simulation engine have a bidirectional partnership around graph-structured problems, particularly QAOA and MaxCut.
+
+### Data Flow
+
+```rust
+/// Graph data provided by ruvector-graph for quantum optimization.
+pub struct GraphProblem {
+    pub vertex_count: u32,
+    pub edges: Vec<(u32, u32, f64)>,  // (source, target, weight)
+    pub problem_type: GraphProblemType,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub enum GraphProblemType { MaxCut, GraphColoring, TSP }
+
+/// Results returned to ruvector-graph for annotation.
+pub struct QuantumGraphResult {
+    pub objective_value: CutValue,
+    pub partition: Vec<bool>,
+    pub confidence: f64,
+    pub circuit_depth: CircuitDepth,
+}
+
+/// Partnership interface: both sides contribute and consume.
+pub trait GraphQuantumPartnership {
+    /// Graph -> Quantum: convert graph problem to QAOA circuit.
+    fn graph_to_qaoa_circuit(
+        &self,
+        problem: &GraphProblem,
+        layers: u32,
+    ) -> Result<QuantumCircuit, DomainError>;
+
+    /// Quantum -> Graph: feed optimization results back as graph annotations.
+    fn annotate_graph_with_result(
+        &self,
+        problem: &GraphProblem,
+        result: &QuantumGraphResult,
+    ) -> Result<GraphAnnotation, DomainError>;
+
+    /// Shared interest: partition graph using ruvector-mincut for subproblem decomposition.
+    fn decompose_problem(
+        &self,
+        problem: &GraphProblem,
+        max_subproblem_qubits: u32,
+    ) -> Result<Vec<GraphProblem>, DomainError>;
+}
+
+/// Annotation written back to the graph database.
+pub struct GraphAnnotation {
+    pub vertex_labels: HashMap<u32, String>,
+    pub edge_labels: HashMap<(u32, u32), String>,
+    pub metadata: HashMap<String, String>,
+}
+```
+
+---
+
+## Cross-Cutting Concerns
+
+### Error Handling Across Boundaries
+
+Each bounded context defines its own error type. At integration boundaries, errors are translated through the ACL rather than propagated directly.
+
+```rust
+/// Integration boundary error: wraps domain errors from either side.
+#[derive(Debug, thiserror::Error)]
+pub enum IntegrationError {
+    #[error("quantum engine error: {0}")]
+    QuantumEngine(#[from] DomainError),
+
+    #[error("coherence bridge error: {0}")]
+    CoherenceBridge(#[from] BridgeError),
+
+    #[error("contract violation: {0}")]
+    ContractViolation(String),
+
+    #[error("resource limit exceeded: {0}")]
+    ResourceLimit(String),
+}
+```
+
+### Observability
+
+Distributed tracing spans cross crate boundaries with a shared trace context.
+
+- Each integration call propagates a `TraceId` through the ACL
+- The coherence bridge logs translation events at `DEBUG` level
+- Agent contract calls log at `INFO` with duration and resource usage
+- WASM calls use `console.log` via the JS bridge when tracing is enabled
+
+### Resource Management
+
+Memory and thread resources are coordinated with the ruVector runtime.
+
+- State vector allocation checks the global memory budget before proceeding
+- Tensor network contractions respect thread pool limits shared with rayon
+- WASM mode has a fixed 4GB ceiling enforced at the conformist boundary
+- All resource allocation events emit `MemoryAllocated` / `MemoryReleased` domain events
+
+### Configuration Propagation
+
+Configuration flows from the ruVector root config into the quantum engine.
+
+```rust
+/// Quantum engine configuration derived from ruVector global config.
+pub struct QuantumEngineConfig {
+    pub max_qubits: u32,
+    pub default_backend: BackendType,
+    pub memory_budget_bytes: usize,
+    pub thread_count: usize,
+    pub coherence_bridge_enabled: bool,
+    pub wasm_mode: bool,
+}
+
+impl From<&RuVectorConfig> for QuantumEngineConfig {
+    fn from(global: &RuVectorConfig) -> Self {
+        Self {
+            max_qubits: global.quantum.max_qubits.unwrap_or(30),
+            default_backend: global.quantum.backend.parse().unwrap_or(BackendType::StateVector),
+            memory_budget_bytes: global.memory.budget_bytes,
+            thread_count: global.runtime.thread_count,
+            coherence_bridge_enabled: global.coherence.enabled,
+            wasm_mode: cfg!(target_arch = "wasm32"),
+        }
+    }
+}
+```
+
+---
+
+## Event Flow Diagrams
+
+### 1. VQE Optimization Flow
+
+```
+Agent              CircuitBuilder     SimSession       QuantumState      Optimizer
+  |                     |                |                 |                |
+  | build_circuit(spec) |                |                 |                |
+  |-------------------->|                |                 |                |
+  |   CircuitHandle     |                |                 |                |
+  |<--------------------|                |                 |                |
+  |                     |                |                 |                |
+  | run_vqe(spec)       |                |                 |                |
+  |-------------------------------------------------------------->|        |
+  |                     |                |                 |  init(params)  |
+  |                     |                |                 |<---------------|
+  |                     |                |                 |                |
+  |                     |          +-----|---LOOP----------|--------+       |
+  |                     |          |     |                 |        |       |
+  |                     |          | start()              |        |       |
+  |                     |          |     |----->|          |        |       |
+  |                     |          |     | apply_gates()   |        |       |
+  |                     |          |     |     |---------->|        |       |
+  |                     |          |     |     |  expectation_value |       |
+  |                     |          |     |     |---------->|        |       |
+  |                     |          |     |     |  energy   |        |       |
+  |                     |          |     |<----|-----------|        |       |
+  |                     |          |     |                 | update(grad)   |
+  |                     |          |     |                 |------->|       |
+  |                     |          |     |                 | new_params     |
+  |                     |          |     |                 |<-------|       |
+  |                     |          +-----|---END LOOP------|--------+       |
+  |                     |                |                 |                |
+  |  VQEOutput(energy, params)           |                 |                |
+  |<-------------------------------------------------------------|        |
+  |                     |                |                 |                |
+```
+
+### 2. Surface Code QEC with Coherence Bridge
+
+```
+SurfaceCodeExp     NoiseService    CoherenceBridge    ruQu Filters     Decoder
+  |                    |                |                  |               |
+  | run_cycle()        |                |                  |               |
+  |--+                 |                |                  |               |
+  |  | inject_errors() |                |                  |               |
+  |  |---------------->|                |                  |               |
+  |  | error_list      |                |                  |               |
+  |  |<----------------|                |                  |               |
+  |  |                 |                |                  |               |
+  |  | extract_syndrome()               |                  |               |
+  |  |--+              |                |                  |               |
+  |  |  | SyndromeBits |                |                  |               |
+  |  |<-+              |                |                  |               |
+  |  |                 |                |                  |               |
+  |  | syndrome_to_filter_input()       |                  |               |
+  |  |--------------------------------->|                  |               |
+  |  |                 | FilterInput    |                  |               |
+  |  |                 |                |  process()       |               |
+  |  |                 |                |----------------->|               |
+  |  |                 |                |  Verdict         |               |
+  |  |                 |                |<-----------------|               |
+  |  |                 |                |                  |               |
+  |  |                 | correction_to_pauli_ops()         |               |
+  |  |<---------------------------------|                  |               |
+  |  |                 |                |                  |               |
+  |  | decode(syndrome)|                |                  |               |
+  |  |------------------------------------------------------------------>|
+  |  | correction      |                |                  |               |
+  |  |<------------------------------------------------------------------|
+  |  |                 |                |                  |               |
+  |  | check_logical_error()            |                  |               |
+  |  |--+              |                |                  |               |
+  |  |  | bool         |                |                  |               |
+  |  |<-+              |                |                  |               |
+  |  |                 |                |                  |               |
+  | CycleReport       |                |                  |               |
+  |<-+                 |                |                  |               |
+```
+
+### 3. WASM Deployment Flow
+
+```
+Browser JS          ruqu-wasm (WASM)       ruqu-core           Results
+  |                      |                     |                   |
+  | new WasmSimulator(n) |                     |                   |
+  |--------------------->|                     |                   |
+  |                      | QuantumState::new(n)|                   |
+  |                      |-------------------->|                   |
+  |                      | state               |                   |
+  |                      |<--------------------|                   |
+  |  WasmSimulator       |                     |                   |
+  |<---------------------|                     |                   |
+  |                      |                     |                   |
+  | add_gate("h", 0)     |                     |                   |
+  |--------------------->|                     |                   |
+  |                      | circuit.add_gate()  |                   |
+  |                      |-------------------->|                   |
+  |  Ok                  |                     |                   |
+  |<---------------------|                     |                   |
+  |                      |                     |                   |
+  | add_gate("cx", 1, 0) |                     |                   |
+  |--------------------->|                     |                   |
+  |                      | circuit.add_gate()  |                   |
+  |                      |-------------------->|                   |
+  |  Ok                  |                     |                   |
+  |<---------------------|                     |                   |
+  |                      |                     |                   |
+  | run(1000)            |                     |                   |
+  |--------------------->|                     |                   |
+  |                      | session.start()     |                   |
+  |                      |-------------------->|                   |
+  |                      | run_to_completion() |                   |
+  |                      |-------------------->|                   |
+  |                      |                     | [gate loop]       |
+  |                      |                     |---+               |
+  |                      |                     |   | apply_gate()  |
+  |                      |                     |<--+               |
+  |                      |                     | measure()         |
+  |                      |                     |---+               |
+  |                      |                     |   | outcomes      |
+  |                      |                     |<--+               |
+  |                      | SimulationMetrics   |                   |
+  |                      |<--------------------|                   |
+  |                      |                     |                   |
+  |                      | JSON.serialize(counts)                  |
+  |                      |---------------------------------------->|
+  |  "{\"00\": 503, \"11\": 497}"              |                   |
+  |<---------------------|                     |                   |
+  |                      |                     |                   |
+  | [JS callback with results]                 |                   |
+  |                      |                     |                   |
+```
+
+---
+
+## Migration Strategy
+
+### Phase 1: Standalone ruqu-core
+
+**Goal**: A self-contained crate with no external dependencies except `ruvector-math`.
+
+- Implement `QuantumCircuit`, `QuantumState`, `SimulationSession` aggregates
+- Implement `CircuitBuilder`, `GateFusionService`, `NoiseInjectionService`
+- All value objects and domain events defined
+- Unit tests and property-based tests for normalization, gate unitarity
+- No coherence bridge, no agent integration, no WASM
+
+**Dependency**: `ruvector-math` (shared kernel only)
+
+### Phase 2: ruqu-algorithms + Coherence Integration
+
+**Goal**: Add VQE, surface code experiments, and the coherence bridge.
+
+- Implement `VQEOptimization`, `SurfaceCodeExperiment` aggregates
+- Implement `TensorNetworkState` for circuits exceeding state vector limits
+- Build `CoherenceBridge` anti-corruption layer
+- Integrate with ruQu `FilterPipeline` and `MWPMDecoder`
+- Add `PauliExpectationService`, `ContractionPathOptimizer`
+- Integration tests: VQE convergence, surface code logical error rate vs theory
+
+**Dependencies**: `ruqu-core`, `ruvector-math`, `ruqu` (coherence bridge target)
+
+### Phase 3: ruqu-wasm
+
+**Goal**: Deploy to browser environments with graceful degradation.
+
+- Implement `WasmSimulator` conformist wrapper
+- Add `wasm-bindgen` API surface
+- Enforce WASM constraints (25-qubit limit, no threads, no filesystem)
+- JavaScript test harness running circuits in headless browser
+- Performance benchmarks: gate throughput in WASM vs native
+
+**Dependencies**: `ruqu-core`, `wasm-bindgen`, `wasm-pack`
+
+### Phase 4: Full Agent System Integration
+
+**Goal**: Complete customer-supplier integration with the claude-flow agent system.
+
+- Implement `SimulationContract` trait and production adapter
+- Add resource estimation and budget enforcement
+- Implement `GraphQuantumPartnership` for QAOA/MaxCut
+- Integration with `ruvector-graph` for graph problem decomposition
+- End-to-end tests: agent builds circuit, runs simulation, acts on results
+- OpenQASM import/export (published language)
+
+**Dependencies**: All previous phases, `ruvector-graph`, `claude-flow` agent SDK
+
+---
+
+## References
+
+1. Evans, E. (2003). "Domain-Driven Design: Tackling Complexity in the Heart of Software."
+2. Vernon, V. (2013). "Implementing Domain-Driven Design." Chapter 13: Integrating Bounded Contexts.
+3. Coherence Engine DDD: `docs/architecture/coherence-engine-ddd.md`
+4. ruQu crate: `crates/ruQu/`
+5. ruvector-math: shared kernel for SIMD and complex number operations
+6. OpenQASM 3.0 specification: https://openqasm.com/
diff --git a/docs/architecture/quantum-engine/quantum-engine-ddd-strategic.md b/docs/architecture/quantum-engine/quantum-engine-ddd-strategic.md
new file mode 100644
index 00000000..027e69cd
--- /dev/null
+++ b/docs/architecture/quantum-engine/quantum-engine-ddd-strategic.md
@@ -0,0 +1,530 @@
+# Quantum Simulation Engine: Domain-Driven Design - Strategic Design
+
+**Version**: 0.1
+**Date**: 2026-02-06
+**Status**: Draft
+
+---
+
+## Domain Vision
+
+The Quantum Simulation Engine provides **on-device quantum algorithm experimentation** within ruVector's always-on, agentic environment. It enables hybrid classical-quantum research on edge devices, allowing agents to leverage quantum algorithms (VQE, Grover, QAOA, QEC) without cloud services.
+
+> **This is not a cloud quantum API.** The engine answers: "What does this quantum circuit produce?" entirely on the local device, using classical state-vector simulation with SIMD acceleration.
+
+The engine follows ruVector's event-driven model: **inert when idle, activated on demand, resources released immediately**. A 20-qubit simulation allocates 16 MiB of state vector on activation and frees it the moment the circuit completes. No background threads, no persistent memory, no warm pools.
+
+### The Universal Simulation Object
+
+The power lies in a **single underlying state-vector engine** inside ruqu-sim. Once the linear algebra is fixed, everything else becomes interpretation:
+
+| Domain | Qubits Become | Gates Become | Measurement Becomes | Circuit Becomes |
+|--------|---------------|--------------|---------------------|-----------------|
+| **Chemistry** | Molecular orbitals | Fermionic operators | Energy estimates | VQE ansatz |
+| **Optimization** | Decision variables | Mixing/cost ops | Cut values | QAOA circuit |
+| **Search** | Database indices | Oracle + diffusion | Found element | Grover iterations |
+| **Error Correction** | Data + ancilla qubits | Stabilizer checks | Syndrome bits | QEC cycle |
+| **Cryptography** | Key register bits | Quantum Fourier transform | Period estimate | Shor subroutine |
+| **Machine Learning** | Feature dimensions | Parameterized rotations | Classification | Quantum kernel |
+
+**Same linear algebra, different interpretations. Same state vector = superposition. Same measurement = probabilistic collapse with Born rule.**
+
+---
+
+## Strategic Design
+
+### Core Domain
+
+**Quantum State Simulation** - The heart of the system, managing quantum state vectors, applying unitary gate operations, and performing projective measurements. This is where the primary complexity and innovation reside. **Most circuits run in a single fast pass; only large entangled states or iterative variational loops require sustained computation.**
+
+### Supporting Domains
+
+1. **Circuit Construction** - Building, validating, and optimizing quantum circuits
+2. **State Management** - State vector lifecycle, entanglement tracking, memory gating
+3. **Measurement & Observation** - Projective measurement, expectation values, syndrome extraction
+4. **Algorithm Execution** - High-level quantum algorithm implementations (VQE, Grover, QAOA, QEC)
+5. **Optimization & Backend** - SIMD acceleration, gate fusion, tensor network backends
+6. **Deployment & Integration** - WASM compilation, agent bridge, coherence bridge to ruQu
+
+### Generic Domains
+
+1. **Linear Algebra** - Complex number math, matrix-vector products, Kronecker products (via `ruvector-math`)
+2. **Random Sampling** - Measurement outcome sampling, noise injection (via `rand` crate)
+3. **Logging/Tracing** - Event recording, performance metrics (via `tracing` crate + `ruvector-metrics`)
+
+### Application Evolution
+
+| Timeline | Capabilities | Key Value |
+|----------|-------------|-----------|
+| **Phase 1 (Now)** | State vector sim, basic gates, VQE/Grover/QAOA | Local quantum experimentation without cloud |
+| **Phase 2 (6mo)** | Tensor networks, noise models, surface code cycles | Error correction research on edge devices |
+| **Phase 3 (12mo)** | GPU acceleration, OpenQASM 3.0 import, 30+ qubits | Production-grade quantum algorithm research |
+| **Phase 4 (24mo)** | Quantum hardware bridge, hybrid cloud-local execution | Real quantum device integration |
+
+> **Edge-First Quantum**: The system eventually enables agents to reason about quantum algorithms without any network dependency.
+
+---
+
+## Ecosystem Integration Map
+
+```
++---------------------------------------------------------------------------+
+|                        QUANTUM SIMULATION ENGINE                          |
+|                                                                           |
+|  +-------------------------------------------------------------------+   |
+|  |                  CIRCUIT CONSTRUCTION DOMAIN                       |   |
+|  |  QuantumCircuit | Gate | GateSchedule | CircuitOptimizer          |   |
+|  |  Parameterized templates (VQE ansatz, QAOA mixer, Grover oracle)  |   |
+|  +-------------------------------------------------------------------+   |
+|                              |                                            |
+|                              v                                            |
+|  +-----------------------------+  +-----------------------------+        |
+|  | CORE: QUANTUM STATE         |  | STATE MANAGEMENT            |        |
+|  | SIMULATION                  |<-| DOMAIN                      |        |
+|  |                             |  |                             |        |
+|  | * State vector engine       |  | * Allocation / deallocation |        |
+|  | * Gate application (SIMD)   |  | * Entanglement tracking     |        |
+|  | * Unitary evolution         |  | * Memory gating (zero-idle) |        |
+|  | * Tensor contraction        |  | * State checkpointing       |        |
+|  +-----------------------------+  +-----------------------------+        |
+|           |            |                      |                           |
+|           v            v                      v                           |
+|  +-----------------------------+  +-----------------------------+        |
+|  | MEASUREMENT &               |  | ALGORITHM EXECUTION         |        |
+|  | OBSERVATION DOMAIN          |  | DOMAIN                      |        |
+|  |                             |  |                             |        |
+|  | * Projective measurement    |  | * VQE + classical optimizer |        |
+|  | * Expectation values        |  | * Grover auto-iteration     |        |
+|  | * Shot-based sampling       |  | * QAOA graph-based circuits |        |
+|  | * Syndrome extraction       |  | * Surface code + decoder    |        |
+|  +-----------------------------+  +-----------------------------+        |
+|                                            |                              |
+|                                            v                              |
+|  +-----------------------------+  +-----------------------------+        |
+|  | OPTIMIZATION &              |  | DEPLOYMENT &                |        |
+|  | BACKEND DOMAIN              |  | INTEGRATION DOMAIN          |        |
+|  |                             |  |                             |        |
+|  | * SIMD dispatch             |  | * WASM bindings (ruqu-wasm) |        |
+|  | * Gate fusion               |  | * Agent bridge (activation) |        |
+|  | * Tensor network backend    |  | * Observability / metrics   |        |
+|  | * Cache-local strategies    |  | * Coherence bridge (ruQu)   |        |
+|  +-----------------------------+  +-----------------------------+        |
+|                                                                           |
++---------------------------------------------------------------------------+
+                              |
+         +--------------------+---------------------+
+         |                    |                      |
+         v                    v                      v
+  +--------------+   +-----------------+   +------------------+
+  | ruvector-    |   | ruvector-       |   | ruQu             |
+  | math (SIMD)  |   | metrics         |   | (decoder bridge) |
+  +--------------+   +-----------------+   +------------------+
+         |                                          |
+         v                                          v
+  +--------------+   +-----------------+   +------------------+
+  | ruvector-    |   | ruvector-       |   | cognitum-gate-   |
+  | graph        |   | nervous-system  |   | kernel (tiles)   |
+  +--------------+   +-----------------+   +------------------+
+         |                    |
+         v                    v
+  +--------------+   +-----------------+
+  | ruvector-    |   | sona (adaptive  |
+  | mincut       |   |  learning)      |
+  +--------------+   +-----------------+
+```
+
+### Crate-to-Context Mapping
+
+| Bounded Context | Primary Crate | Supporting Crates |
+|-----------------|---------------|-------------------|
+| Circuit Construction | `ruqu-sim` (new) | - |
+| Quantum State Simulation (Core) | `ruqu-sim` (new) | `ruvector-math` |
+| State Management | `ruqu-sim` (new) | - |
+| Measurement & Observation | `ruqu-sim` (new) | `rand` |
+| Algorithm Execution | `ruqu-sim` (new) | `ruvector-graph` (QAOA) |
+| Optimization & Backend | `ruqu-sim` (new) | `ruvector-math` (SIMD) |
+| Deployment & Integration | `ruqu-wasm` (new) | `ruqu`, `ruvector-metrics`, `ruvector-nervous-system` |
+
+---
+
+## Context Map
+
+```
++-----------------------------------------------------------------------+
+|                     QUANTUM ENGINE CONTEXT MAP                         |
+|                                                                        |
+|                     [Published Language]                                |
+|                     OpenQASM 3.0 format                                |
+|                            |                                           |
+|                            v                                           |
+|   +------------------+         +------------------+                    |
+|   |                  | Shared  |                  |                    |
+|   |  CIRCUIT         | Kernel  |  STATE           |                    |
+|   |  CONSTRUCTION    |<------->|  MANAGEMENT      |                    |
+|   |                  | (Gate,  |                  |                    |
+|   |  Builds circuits | QubitIdx|  Allocates and   |                    |
+|   |  Validates gates |  types) |  tracks state    |                    |
+|   +--------+---------+         +--------+---------+                    |
+|            |                            |                              |
+|            | Customer                   | Customer                     |
+|            | Supplier                   | Supplier                     |
+|            v                            v                              |
+|   +------------------+         +------------------+                    |
+|   |                  |         |                  |                    |
+|   |  MEASUREMENT &   |-------->|  ALGORITHM       |                    |
+|   |  OBSERVATION     |Supplier |  EXECUTION       |                    |
+|   |                  |Customer |                  |                    |
+|   |  Measures states |         |  Runs VQE/QAOA/  |                    |
+|   |  Extracts syndr. |         |  Grover/QEC      |                    |
+|   +--------+---------+         +--------+---------+                    |
+|            |                            |                              |
+|            +------------+---------------+                              |
+|                         |                                              |
+|                         v                                              |
+|            +------------------+         +------------------+           |
+|            |                  |         |                  |           |
+|            |  OPTIMIZATION &  |         |  DEPLOYMENT &    |           |
+|            |  BACKEND         |         |  INTEGRATION     |           |
+|            |                  |         |                  |           |
+|            |  SIMD, fusion,   |         |  WASM, agents,   |           |
+|            |  tensor networks |         |  ruQu bridge     |           |
+|            +------------------+         +--------+---------+           |
+|                                                  |                     |
+|                                    Conformist    | Anti-Corruption     |
+|                                    (ruVector     | Layer               |
+|                                     APIs)        | (ruQu decoder)     |
+|                                                  |                     |
++--------------------------------------------------+---------------------+
+                                                   |
+                                                   v
+                                     [Existing ruVector Ecosystem]
+
+Context Relationships:
+  <-------> Shared Kernel (shared types across boundary)
+  -------> Customer-Supplier (downstream depends on upstream)
+  Conformist: Deployment conforms to existing ruVector APIs
+  ACL: CoherenceBridge wraps ruQu decoder behind anti-corruption layer
+  Published Language: OpenQASM 3.0 for circuit interchange
+  Open Host Service: ruqu-wasm exposes JS API
+```
+
+### Relationship Summary
+
+| Upstream | Downstream | Pattern | Shared Types |
+|----------|------------|---------|-------------|
+| Circuit Construction | State Management | **Shared Kernel** | `Gate`, `QubitIndex`, `GateMatrix` |
+| Measurement & Observation | Algorithm Execution | **Customer-Supplier** | `MeasurementOutcome`, `ExpectationValue` |
+| State Management | Algorithm Execution | **Customer-Supplier** | `QuantumState`, `StateCheckpoint` |
+| State Management | Measurement & Observation | **Customer-Supplier** | `QuantumState`, `Amplitude` |
+| Optimization & Backend | Core Simulation | **Partnership** | `FusedGateMatrix`, `OptimizationHint` |
+| Existing ruVector APIs | Deployment & Integration | **Conformist** | ruVector event types, metric types |
+| ruQu decoder API | Deployment & Integration | **Anti-Corruption Layer** | Isolated behind `CoherenceBridge` |
+| Circuit Construction | External tools | **Published Language** | OpenQASM 3.0 circuit format |
+| Deployment & Integration | JS consumers | **Open Host Service** | `ruqu-wasm` JS API |
+
+---
+
+## Ubiquitous Language
+
+### Quantum Fundamentals
+
+| Term | Definition |
+|------|------------|
+| **Qubit** | Fundamental unit of quantum information existing in superposition of |0> and |1> basis states |
+| **Amplitude** | Complex number representing probability amplitude of a basis state; measurement probability is its squared modulus |
+| **State Vector** | Array of 2^n complex amplitudes representing the full quantum state of an n-qubit register |
+| **Basis State** | One of 2^n classical bit-string configurations; each has an associated amplitude |
+| **Superposition** | State where multiple basis states have nonzero amplitude |
+| **Entanglement** | Quantum correlation preventing independent per-qubit factorization of the joint state |
+| **Born Rule** | Measurement probability equals squared modulus of amplitude: P(x) = |alpha_x|^2 |
+
+### Circuit Model
+
+| Term | Definition |
+|------|------------|
+| **Gate** | Unitary matrix operation acting on 1 or 2 qubits; transforms state via matrix-vector multiply |
+| **Circuit** | Ordered sequence of gates applied to a qubit register; the program of a quantum computation |
+| **Gate Matrix** | Unitary matrix defining gate action; must satisfy U * U_dagger = I |
+| **Qubit Index** | Zero-based integer identifying a qubit; determines which amplitude pairs a gate addresses |
+| **Circuit Depth** | Maximum sequential gate layers; primary determinant of simulation time |
+| **Parameterized Gate** | Gate whose matrix depends on continuous real parameters (e.g., Ry(theta)) |
+| **Gate Fusion** | Combining adjacent gates on same qubits into a single matrix multiply |
+| **Gate Schedule** | Topologically sorted gate-to-timestep assignment respecting qubit-sharing constraints |
+
+### Measurement & Algorithms
+
+| Term | Definition |
+|------|------------|
+| **Measurement** | Projective observation collapsing superposition to a basis state per the Born rule |
+| **Mid-Circuit Measurement** | Measurement during (not only at end of) circuit execution |
+| **Shot** | Single circuit execution + measurement; repeated shots build statistics |
+| **Expectation Value** | Observable average over quantum state: <psi|H|psi> |
+| **Pauli String** | Tensor product of per-qubit Pauli operators (I/X/Y/Z) with coefficient |
+| **Hamiltonian** | Hermitian operator (weighted sum of Pauli strings) representing total energy |
+| **Syndrome** | Classical bits from ancilla measurements indicating error presence and location |
+| **Ansatz** | Parameterized circuit template encoding the variational search space |
+| **VQE** | Variational Quantum Eigensolver; iteratively minimizes Hamiltonian expectation |
+| **QAOA** | Quantum Approximate Optimization Algorithm; alternating cost/mixer unitaries |
+| **Grover Search** | Amplitude amplification finding marked items in O(sqrt(N)) queries |
+| **Oracle** | Black-box gate marking target states by phase flip |
+| **Surface Code** | 2D topological QEC code with stabilizer checks on lattice faces/vertices |
+| **Logical Error Rate** | Undetected logical error probability per QEC cycle |
+| **Decoder** | Classical algorithm mapping syndromes to corrections; bridge to ruQu |
+
+### Simulation Infrastructure
+
+| Term | Definition |
+|------|------------|
+| **State Allocator** | On-demand allocation/deallocation enforcing zero-idle policy |
+| **Memory Estimate** | Predicted bytes: 2^n * 16; gating threshold for allocation |
+| **Entanglement Tracker** | Tracks qubit correlations enabling subsystem splitting |
+| **State Checkpoint** | Serialized state snapshot for mid-circuit save/restore |
+| **Tensor Network** | Alternative representation via contracted tensor factors; efficient for low entanglement |
+| **Contraction Path** | Tensor contraction order minimizing total FLOPs |
+
+---
+
+## Bounded Context Details
+
+### Context 1: Circuit Construction Domain
+
+**Purpose**: Language for expressing quantum computations. Validation, scheduling, optimization, OpenQASM interchange.
+
+| Entity / Value Object | Type | Responsibility |
+|----------------------|------|---------------|
+| **QuantumCircuit** | Aggregate Root | Ordered gate collection with register metadata |
+| **Gate** | Entity | Single unitary with target qubits and optional parameters |
+| **GateSchedule** | Entity | Time-step assignment for parallel execution analysis |
+| **CircuitOptimizer** | Domain Service | Fusion, cancellation, and commutation rules |
+| GateId, QubitIndex, GateMatrix, ParameterBinding, GateType | Value Objects | Immutable circuit building blocks |
+
+**Events**: `CircuitCreated`, `GateAppended`, `CircuitOptimized`, `CircuitValidated`, `ParametersBound`
+
+**Invariants**: (1) Gate unitarity. (2) Qubit indices within bounds. (3) No duplicate targets per gate. (4) All parameters bound before execution.
+
+---
+
+### Context 2: State Management Domain
+
+**Purpose**: State vector lifecycle following zero-idle model. Entanglement tracking. Memory gating.
+
+| Entity / Value Object | Type | Responsibility |
+|----------------------|------|---------------|
+| **QuantumState** | Aggregate Root | Owns the 2^n complex amplitude array |
+| **EntanglementTracker** | Entity | Bipartite entanglement graph for subsystem analysis |
+| **StateAllocator** | Domain Service | On-demand allocation, immediate deallocation |
+| Amplitude, QubitCount, MemoryEstimate, StateCheckpoint | Value Objects | State representation primitives |
+
+**Events**: `StateAllocated`, `StateDeallocated`, `EntanglementDetected`, `SubsystemSplit`, `CheckpointCreated`, `MemoryLimitExceeded`
+
+**Invariants**: (1) Normalization preserved. (2) Zero-idle: no state persists beyond execution. (3) Allocation gated by device capacity. (4) Checkpoint restore reproduces exact amplitudes.
+
+---
+
+### Context 3: Measurement & Observation Domain
+
+**Purpose**: Projective measurement with collapse. Analytical expectation values. Syndrome extraction for QEC.
+
+| Entity / Value Object | Type | Responsibility |
+|----------------------|------|---------------|
+| **MeasurementEngine** | Aggregate Root | Born-rule sampling and state collapse |
+| **ExpectationCalculator** | Entity | Analytical <psi|H|psi> from Pauli decomposition |
+| **SyndromeExtractor** | Entity | Ancilla measurement and classical bit extraction |
+| MeasurementOutcome, PauliString, Hamiltonian, SyndromeBits, ShotResult | Value Objects | Measurement data types |
+
+**Events**: `MeasurementPerformed`, `ExpectationComputed`, `SyndromeExtracted`, `ShotsCompleted`
+
+**Invariants**: (1) Born rule: probabilities sum to 1.0. (2) Post-measurement collapse to definite state. (3) Hamiltonian Hermiticity. (4) Syndrome bit count matches code.
+
+---
+
+### Context 4: Algorithm Execution Domain
+
+**Purpose**: High-level quantum algorithms as orchestrated loops over circuits, states, and measurements.
+
+| Entity / Value Object | Type | Responsibility |
+|----------------------|------|---------------|
+| **VQERunner** | Entity | Iterative ansatz parameter optimization to minimize energy |
+| **GroverSearch** | Entity | Oracle + diffusion with auto-computed iteration count |
+| **QAOASolver** | Entity | Graph-based cost/mixer circuit construction and angle optimization |
+| **SurfaceCodeSimulator** | Entity | Stabilizer cycles, syndrome extraction, decoder invocation |
+| AlgorithmResult, OptimizationTrace, CutValue, LogicalErrorRate, ConvergenceCriteria | Value Objects | Algorithm output types |
+
+**Events**: `VQEIterationCompleted`, `VQEConverged`, `GroverSearchCompleted`, `QAOARoundCompleted`, `SurfaceCodeCycleCompleted`, `LogicalErrorDetected`
+
+**Invariants**: (1) Grover iteration count = floor(pi/4 * sqrt(N/M)). (2) VQE energy is upper bound on ground state. (3) QAOA cost/mixer alternate with correct parameter count. (4) Surface code distance matches lattice.
+
+---
+
+### Context 5: Optimization & Backend Domain
+
+**Purpose**: Performance backends that accelerate simulation without altering semantics. SIMD, fusion, tensor networks.
+
+| Entity / Value Object | Type | Responsibility |
+|----------------------|------|---------------|
+| **SimulationBackend** | Aggregate Root | Selects optimal execution strategy |
+| **GateFuser** | Entity | Combines compatible gate sequences into single operations |
+| **TensorContractor** | Entity | Tensor network decomposition for low-entanglement states |
+| **SIMDDispatcher** | Entity | Platform detection and optimized kernel dispatch |
+| OptimizationHint, ContractionPath, FusedGateMatrix, PlatformCapabilities | Value Objects | Backend selection metadata |
+
+**Events**: `BackendSelected`, `GatesFused`, `TensorNetworkContracted`, `SIMDKernelDispatched`
+
+**Invariants**: (1) Fused gates produce identical results to sequential. (2) Tensor contraction matches state-vector. (3) SIMD falls back to scalar if unavailable. (4) Intermediates stay within memory budget.
+
+---
+
+### Context 6: Deployment & Integration Domain
+
+**Purpose**: WASM compilation, agent activation bridge, ruQu decoder anti-corruption layer, observability.
+
+| Entity / Value Object | Type | Responsibility |
+|----------------------|------|---------------|
+| **WASMBindings** | Entity | Open Host Service via wasm-bindgen JS API |
+| **AgentBridge** | Entity | ruvector-nervous-system integration for context-triggered activation |
+| **MetricsReporter** | Entity | Publishes SimulationMetrics to ruvector-metrics |
+| **CoherenceBridge** | Entity | ACL translating syndromes to ruQu's DetectorBitmap/SyndromeRound |
+| PlatformCapabilities, QubitLimit, SimulationMetrics, DecoderResult | Value Objects | Integration data types |
+
+**Events**: `SimulationRequested`, `SimulationCompleted`, `ResourcesReleased`, `DecoderInvoked`, `MetricsPublished`
+
+**Integration Patterns**:
+- **Anti-Corruption Layer**: CoherenceBridge isolates engine from ruQu's internal DDD model
+- **Conformist**: Deployment conforms to existing ruVector event types and metric schemas
+- **Open Host Service**: ruqu-wasm exposes clean JS/TS API for browser experimentation
+- **Published Language**: OpenQASM 3.0 for circuit interchange with external tools
+
+---
+
+## Cross-Cutting Concerns
+
+### Zero-Idle Resource Model
+
+```
+IDLE (0 bytes) --> ACTIVATE (allocate 2^n * 16 bytes) --> COMPUTE --> RELEASE (0 bytes)
+```
+
+No warm pools, no pre-allocated buffers, no background threads.
+
+### Memory Gating
+
+| Qubits | State Vector Size | Decision |
+|--------|-------------------|----------|
+| 10 | 16 KiB | Always permit |
+| 15 | 512 KiB | Always permit |
+| 20 | 16 MiB | Permit on most devices |
+| 25 | 512 MiB | Gate: check available RAM |
+| 30 | 16 GiB | Gate: likely refuse on edge |
+| 35+ | 512 GiB+ | Always refuse (state vector); consider tensor network |
+
+### Error Model
+
+| Context | Error | Severity | Recovery |
+|---------|-------|----------|----------|
+| Circuit Construction | Non-unitary gate | Fatal | Reject circuit |
+| State Management | Memory limit exceeded | Recoverable | Try tensor network or refuse |
+| State Management | Normalization drift | Warning | Renormalize |
+| Measurement | Zero-probability outcome | Warning | Return uniform |
+| Algorithm Execution | VQE non-convergence | Recoverable | Return best-so-far |
+| Deployment | WASM memory limit | Fatal | Report to agent |
+| Deployment | ruQu decoder unavailable | Recoverable | Skip correction, log |
+
+### Observability
+
+All simulation runs produce `SimulationMetrics` (circuit name, qubit count, gate count, depth, shots, backend type, wall time, peak memory, SIMD utilization) flowing through `ruvector-metrics` for unified dashboard integration.
+
+### Security
+
+| Concern | Mitigation |
+|---------|------------|
+| Timing side channels in measurement | Constant-time sampling via rejection method |
+| Memory contents after deallocation | Zero-fill on deallocation (SecureAllocator mode) |
+| Denial-of-service via large qubit counts | Memory gating with hard upper bound per request |
+| Untrusted OpenQASM input | Parser validates unitarity and qubit bounds before execution |
+| WASM sandbox escape | No file I/O, no network; pure computation within WASM sandbox |
+
+---
+
+## Module Structure
+
+```
+crates/ruqu-sim/src/
++-- lib.rs                     # Public API
++-- circuit/                   # Circuit Construction context
+|   +-- quantum_circuit.rs     # QuantumCircuit aggregate
+|   +-- gate.rs                # Gate entity, GateType enum
+|   +-- schedule.rs            # GateSchedule
+|   +-- optimizer.rs           # CircuitOptimizer (fusion, cancel)
+|   +-- openqasm.rs            # OpenQASM 3.0 import/export
++-- state/                     # State Management context
+|   +-- quantum_state.rs       # QuantumState aggregate
+|   +-- allocator.rs           # StateAllocator (zero-idle)
+|   +-- entanglement.rs        # EntanglementTracker
+|   +-- checkpoint.rs          # StateCheckpoint
++-- measurement/               # Measurement & Observation context
+|   +-- engine.rs              # MeasurementEngine
+|   +-- expectation.rs         # ExpectationCalculator
+|   +-- syndrome.rs            # SyndromeExtractor
++-- algorithms/                # Algorithm Execution context
+|   +-- vqe.rs, grover.rs      # VQERunner, GroverSearch
+|   +-- qaoa.rs                # QAOASolver
+|   +-- surface_code.rs        # SurfaceCodeSimulator
++-- backend/                   # Optimization & Backend context
+|   +-- simulation_backend.rs  # SimulationBackend
+|   +-- gate_fuser.rs          # GateFuser
+|   +-- tensor_network.rs      # TensorContractor
+|   +-- simd_dispatch.rs       # SIMDDispatcher
+|   +-- kernels/               # avx2.rs, avx512.rs, neon.rs, wasm_simd.rs, scalar.rs
++-- types.rs, events.rs, error.rs
+
+crates/ruqu-wasm/src/
++-- lib.rs                     # wasm-bindgen entry
++-- js_api.rs                  # JS-facing API
++-- agent_bridge.rs            # ruvector-nervous-system integration
++-- coherence_bridge.rs        # ACL for ruQu decoder
++-- metrics.rs                 # ruvector-metrics export
+```
+
+### Dependency Graph
+
+```
+ruqu-sim
++-- ruvector-math           (SIMD kernels, complex math)
++-- rand                    (measurement sampling)
++-- ruvector-graph          (QAOA graph input)
+
+ruqu-wasm
++-- ruqu-sim                (core simulation)
++-- ruqu                    (coherence bridge ACL)
++-- ruvector-metrics        (observability)
++-- ruvector-nervous-system (agent activation)
++-- wasm-bindgen            (JS bindings)
+```
+
+---
+
+## Performance Targets
+
+| Metric | Target |
+|--------|--------|
+| Single-gate (1q, 20-qubit register) | < 50 us |
+| Full circuit (100 gates, 15 qubits) | < 10 ms |
+| Hamiltonian expectation (10q, 50 terms) | < 1 ms |
+| SIMD speedup over scalar | > 3x (AVX2), > 6x (AVX-512) |
+| Grover (20 qubits, 1 target) | < 500 ms |
+| VQE convergence (H2, 4 qubits) | < 5s, < 100 iterations |
+| State allocation/deallocation | < 10 us / < 1 us |
+| WASM circuit (10 qubits, 50 gates) | < 50 ms |
+
+---
+
+## References
+
+1. Evans, E. (2003). "Domain-Driven Design: Tackling Complexity in the Heart of Software."
+2. Vernon, V. (2013). "Implementing Domain-Driven Design."
+3. Nielsen, M. A. & Chuang, I. L. (2010). "Quantum Computation and Quantum Information."
+4. Peruzzo, A. et al. (2014). "A variational eigenvalue solver on a photonic quantum processor."
+5. Farhi, E. et al. (2014). "A Quantum Approximate Optimization Algorithm."
+6. Fowler, A. G. et al. (2012). "Surface codes: Towards practical large-scale quantum computation."
+7. ruQu crate: Existing coherence assessment and syndrome processing in ruVector.
+8. Coherence Engine DDD: `/docs/architecture/coherence-engine-ddd.md`
diff --git a/docs/architecture/quantum-engine/quantum-engine-ddd-tactical.md b/docs/architecture/quantum-engine/quantum-engine-ddd-tactical.md
new file mode 100644
index 00000000..93623c50
--- /dev/null
+++ b/docs/architecture/quantum-engine/quantum-engine-ddd-tactical.md
@@ -0,0 +1,1426 @@
+# Quantum Simulation Engine: Domain-Driven Design - Tactical Design
+
+**Version**: 0.1
+**Date**: 2026-02-06
+**Status**: Draft
+
+---
+
+## Overview
+
+This document defines the tactical DDD patterns for the ruVector Quantum Simulation Engine (`ruqu-core`, `ruqu-algorithms`, `ruqu-wasm`). It specifies the Aggregates, Entities, Value Objects, Domain Events, Repositories, and Domain Services that compose the simulation domain. All type signatures target Rust and align with the conventions established in the existing `ruqu` crate and the coherence engine DDD.
+
+---
+
+## Value Objects
+
+Value Objects are immutable, identity-less types compared by structural equality. They form the mathematical vocabulary of the quantum simulation domain.
+
+### Qubit and Gate Primitives
+
+```rust
+/// Immutable qubit identifier. Valid range: 0..qubit_count.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct QubitIndex(pub u32);
+
+/// Single complex amplitude for one basis state.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub struct Amplitude(pub Complex<f64>);
+
+/// 2x2 unitary matrix for a single-qubit gate.
+#[derive(Debug, Clone, PartialEq)]
+pub struct GateMatrix(pub [[Complex<f64>; 2]; 2]);
+
+/// 4x4 unitary matrix for a two-qubit gate.
+#[derive(Debug, Clone, PartialEq)]
+pub struct TwoQubitMatrix(pub [[Complex<f64>; 4]; 4]);
+
+/// Individual Pauli operator.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum PauliOp { I, X, Y, Z }
+
+/// Tensor product of Pauli operators acting on consecutive qubits.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct PauliString(pub Vec<PauliOp>);
+
+/// Weighted sum of Pauli strings representing an observable.
+#[derive(Debug, Clone, PartialEq)]
+pub struct Hamiltonian(pub Vec<(f64, PauliString)>);
+```
+
+### Measurement and Outcome Types
+
+```rust
+/// Outcome of measuring a single qubit.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub struct MeasurementOutcome {
+    pub qubit: QubitIndex,
+    pub result: bool,
+    pub probability: f64,
+}
+
+/// Classical syndrome extracted from a QEC cycle.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct SyndromeBits(pub Vec<bool>);
+
+/// Named parameter binding for parametric circuits.
+#[derive(Debug, Clone, PartialEq)]
+pub struct ParameterBinding {
+    pub name: String,
+    pub value: f64,
+}
+```
+
+### Metrics and Resource Types
+
+```rust
+/// Longest path through the circuit DAG.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub struct CircuitDepth(pub u32);
+
+/// Gate fidelity score in the range [0.0, 1.0].
+#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
+pub struct GateFidelity(pub f64);
+
+/// Error probability per gate or per QEC cycle.
+#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
+pub struct NoiseRate(pub f64);
+
+/// QAOA objective function value for a MaxCut instance.
+#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
+pub struct CutValue(pub f64);
+
+/// Logical error rate measured as errors per QEC round.
+#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
+pub struct LogicalErrorRate(pub f64);
+
+/// Memory required in bytes for a given simulation configuration.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub struct MemoryEstimate(pub usize);
+
+/// Maximum qubit count supported by the current platform/backend.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub struct QubitLimit(pub u32);
+
+/// Estimated floating-point operations for a tensor contraction.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub struct ContractionCost(pub u64);
+
+/// MPS bond dimension controlling truncation fidelity.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub struct BondDimension(pub u32);
+
+/// Aggregate simulation performance metrics (immutable snapshot).
+#[derive(Debug, Clone, PartialEq)]
+pub struct SimulationMetrics {
+    pub qubits: u32,
+    pub gates: u32,
+    pub time_ms: f64,
+    pub peak_memory: usize,
+    pub gates_per_sec: f64,
+}
+
+/// Coherence score bridged from the ruQu coherence engine.
+#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
+pub struct CoherenceScore(pub f64);
+```
+
+### Invariant Enforcement
+
+All value objects enforce their invariants at construction time:
+
+```rust
+impl GateFidelity {
+    pub fn new(value: f64) -> Result<Self, DomainError> {
+        if !(0.0..=1.0).contains(&value) {
+            return Err(DomainError::InvalidFidelity(value));
+        }
+        Ok(Self(value))
+    }
+}
+
+impl NoiseRate {
+    pub fn new(value: f64) -> Result<Self, DomainError> {
+        if !(0.0..=1.0).contains(&value) {
+            return Err(DomainError::InvalidNoiseRate(value));
+        }
+        Ok(Self(value))
+    }
+}
+
+impl QubitIndex {
+    pub fn validate(self, qubit_count: u32) -> Result<(), DomainError> {
+        if self.0 >= qubit_count {
+            return Err(DomainError::QubitIndexOutOfRange {
+                index: self,
+                qubit_count,
+            });
+        }
+        Ok(())
+    }
+}
+```
+
+---
+
+## Aggregates
+
+### 1. QuantumCircuit Aggregate
+
+**Root Entity**: `QuantumCircuit`
+**Contains**: `Vec<Gate>`, `qubit_count: u32`, `parameter_bindings: HashMap<String, f64>`
+
+**Invariants**:
+- All gate qubit indices reference valid qubits (0..qubit_count)
+- Parameter names are unique across the circuit
+- Circuit scheduling is acyclic (gates on the same qubits are totally ordered)
+- Gate count does not exceed platform-specific limits
+
+**Factory**: `CircuitBuilder` with fluent API.
+
+```rust
+/// A quantum circuit: the central description of a quantum computation.
+pub struct QuantumCircuit {
+    id: CircuitId,
+    qubit_count: u32,
+    gates: Vec<Gate>,
+    parameter_bindings: HashMap<String, f64>,
+    metadata: CircuitMetadata,
+}
+
+/// A single gate operation within a circuit.
+#[derive(Debug, Clone)]
+pub struct Gate {
+    pub gate_type: GateType,
+    pub target: QubitIndex,
+    pub control: Option<QubitIndex>,
+    pub matrix: GateMatrix,
+    pub parameter: Option<String>,
+}
+
+/// Supported gate types.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum GateType {
+    H, X, Y, Z, S, T, Rx, Ry, Rz, CNOT, CZ, SWAP, Toffoli, Custom,
+}
+
+impl QuantumCircuit {
+    /// Add a gate, enforcing qubit index validity.
+    pub fn add_gate(&mut self, gate: Gate) -> Result<(), DomainError> {
+        gate.target.validate(self.qubit_count)?;
+        if let Some(ctrl) = gate.control {
+            ctrl.validate(self.qubit_count)?;
+            if ctrl == gate.target {
+                return Err(DomainError::SelfControlGate);
+            }
+        }
+        self.gates.push(gate);
+        Ok(())
+    }
+
+    /// Bind a parameter value by name.
+    pub fn set_parameter(&mut self, name: &str, value: f64) -> Result<(), DomainError> {
+        if !self.has_parameter(name) {
+            return Err(DomainError::UnknownParameter(name.to_string()));
+        }
+        self.parameter_bindings.insert(name.to_string(), value);
+        Ok(())
+    }
+
+    /// Return circuit depth as the longest gate chain.
+    pub fn depth(&self) -> CircuitDepth {
+        CircuitDepth(self.compute_dag_depth())
+    }
+
+    /// Apply gate fusion optimization, combining adjacent single-qubit gates.
+    pub fn fuse_gates(&mut self) -> usize {
+        let original_count = self.gates.len();
+        self.gates = GateFusionService::fuse(&self.gates);
+        original_count - self.gates.len()
+    }
+
+    /// Full optimization pass: fusion, cancellation, commutation.
+    pub fn optimize(&mut self) -> OptimizationReport {
+        let before = self.gates.len();
+        self.fuse_gates();
+        // Additional passes: gate cancellation, commutation reordering
+        OptimizationReport {
+            gates_before: before as u32,
+            gates_after: self.gates.len() as u32,
+            depth_before: CircuitDepth(0), // placeholder
+            depth_after: self.depth(),
+        }
+    }
+
+    fn has_parameter(&self, name: &str) -> bool {
+        self.gates.iter().any(|g| g.parameter.as_deref() == Some(name))
+    }
+
+    fn compute_dag_depth(&self) -> u32 {
+        // DAG-based depth computation tracking per-qubit depth
+        let mut qubit_depth = vec![0u32; self.qubit_count as usize];
+        for gate in &self.gates {
+            let t = gate.target.0 as usize;
+            let d = if let Some(c) = gate.control {
+                qubit_depth[t].max(qubit_depth[c.0 as usize]) + 1
+            } else {
+                qubit_depth[t] + 1
+            };
+            qubit_depth[t] = d;
+            if let Some(c) = gate.control {
+                qubit_depth[c.0 as usize] = d;
+            }
+        }
+        qubit_depth.into_iter().max().unwrap_or(0)
+    }
+}
+
+/// Fluent builder for constructing circuits.
+pub struct CircuitBuilder {
+    qubit_count: u32,
+    gates: Vec<Gate>,
+}
+
+impl CircuitBuilder {
+    pub fn new(qubit_count: u32) -> Self {
+        Self { qubit_count, gates: Vec::new() }
+    }
+
+    pub fn h(mut self, target: u32) -> Self {
+        self.gates.push(Gate::hadamard(QubitIndex(target)));
+        self
+    }
+
+    pub fn cx(mut self, control: u32, target: u32) -> Self {
+        self.gates.push(Gate::cnot(QubitIndex(control), QubitIndex(target)));
+        self
+    }
+
+    pub fn rz(mut self, target: u32, param_name: &str) -> Self {
+        self.gates.push(Gate::rz(QubitIndex(target), param_name.to_string()));
+        self
+    }
+
+    pub fn build(self) -> Result<QuantumCircuit, DomainError> {
+        let mut circuit = QuantumCircuit {
+            id: CircuitId::new(),
+            qubit_count: self.qubit_count,
+            gates: Vec::new(),
+            parameter_bindings: HashMap::new(),
+            metadata: CircuitMetadata::default(),
+        };
+        for gate in self.gates {
+            circuit.add_gate(gate)?;
+        }
+        Ok(circuit)
+    }
+}
+```
+
+---
+
+### 2. QuantumState Aggregate
+
+**Root Entity**: `QuantumState`
+**Contains**: `state_vector: Vec<Complex<f64>>`, `qubit_count: u32`, `entanglement_map: EntanglementMap`
+
+**Invariants**:
+- State vector is normalized: sum of |amplitude|^2 = 1.0 (within epsilon)
+- `qubit_count` matches `log2(state_vector.len())`
+- State vector length is always a power of two
+
+**Factory**: `QuantumState::new(n)` initializes the |00...0> computational basis state.
+
+```rust
+/// Full state vector representation of a quantum register.
+pub struct QuantumState {
+    state_vector: Vec<Complex<f64>>,
+    qubit_count: u32,
+    entanglement_map: EntanglementMap,
+}
+
+/// Tracks pairwise entanglement between qubits.
+#[derive(Debug, Clone, Default)]
+pub struct EntanglementMap {
+    /// Adjacency set: (qubit_a, qubit_b) pairs known to be entangled.
+    pairs: HashSet<(QubitIndex, QubitIndex)>,
+}
+
+impl QuantumState {
+    /// Initialize |00...0> state for n qubits.
+    pub fn new(qubit_count: u32) -> Result<Self, DomainError> {
+        if qubit_count > 30 {
+            return Err(DomainError::QubitLimitExceeded {
+                requested: qubit_count,
+                limit: QubitLimit(30),
+            });
+        }
+        let dim = 1usize << qubit_count;
+        let mut sv = vec![Complex::new(0.0, 0.0); dim];
+        sv[0] = Complex::new(1.0, 0.0);
+        Ok(Self {
+            state_vector: sv,
+            qubit_count,
+            entanglement_map: EntanglementMap::default(),
+        })
+    }
+
+    /// Apply a single-qubit gate to the state.
+    pub fn apply_gate(&mut self, gate: &Gate) -> Result<(), DomainError> {
+        gate.target.validate(self.qubit_count)?;
+        match gate.control {
+            None => self.apply_single_qubit(gate.target, &gate.matrix),
+            Some(ctrl) => {
+                ctrl.validate(self.qubit_count)?;
+                self.apply_controlled(ctrl, gate.target, &gate.matrix);
+                self.entanglement_map.mark_entangled(ctrl, gate.target);
+            }
+        }
+        Ok(())
+    }
+
+    /// Measure a single qubit, collapsing the state.
+    pub fn measure(&mut self, qubit: QubitIndex) -> Result<MeasurementOutcome, DomainError> {
+        qubit.validate(self.qubit_count)?;
+        let prob_one = self.probability_of_one(qubit);
+        let result = rand::random::<f64>() < prob_one;
+        self.collapse(qubit, result);
+        self.renormalize();
+        Ok(MeasurementOutcome {
+            qubit,
+            result,
+            probability: if result { prob_one } else { 1.0 - prob_one },
+        })
+    }
+
+    /// Reset a qubit to |0>, disentangling it from the register.
+    pub fn reset_qubit(&mut self, qubit: QubitIndex) -> Result<(), DomainError> {
+        qubit.validate(self.qubit_count)?;
+        self.collapse(qubit, false);
+        self.renormalize();
+        self.entanglement_map.remove_qubit(qubit);
+        Ok(())
+    }
+
+    /// Compute expectation value <psi|H|psi> for a Hamiltonian.
+    pub fn expectation_value(&self, hamiltonian: &Hamiltonian) -> f64 {
+        hamiltonian.0.iter()
+            .map(|(coeff, pauli)| coeff * self.pauli_expectation(pauli))
+            .sum()
+    }
+
+    /// Verify normalization invariant.
+    pub fn is_normalized(&self, epsilon: f64) -> bool {
+        let norm_sq: f64 = self.state_vector.iter()
+            .map(|a| a.norm_sqr())
+            .sum();
+        (norm_sq - 1.0).abs() < epsilon
+    }
+
+    /// Memory estimate for this state.
+    pub fn memory_estimate(&self) -> MemoryEstimate {
+        MemoryEstimate(self.state_vector.len() * std::mem::size_of::<Complex<f64>>())
+    }
+
+    fn probability_of_one(&self, qubit: QubitIndex) -> f64 {
+        let mask = 1usize << qubit.0;
+        self.state_vector.iter().enumerate()
+            .filter(|(i, _)| i & mask != 0)
+            .map(|(_, a)| a.norm_sqr())
+            .sum()
+    }
+
+    fn collapse(&mut self, qubit: QubitIndex, to_one: bool) {
+        let mask = 1usize << qubit.0;
+        for (i, amp) in self.state_vector.iter_mut().enumerate() {
+            let is_one = (i & mask) != 0;
+            if is_one != to_one {
+                *amp = Complex::new(0.0, 0.0);
+            }
+        }
+    }
+
+    fn renormalize(&mut self) {
+        let norm: f64 = self.state_vector.iter().map(|a| a.norm_sqr()).sum::<f64>().sqrt();
+        if norm > 1e-15 {
+            for amp in &mut self.state_vector {
+                *amp /= norm;
+            }
+        }
+    }
+
+    fn apply_single_qubit(&mut self, target: QubitIndex, matrix: &GateMatrix) { /* ... */ }
+    fn apply_controlled(&mut self, ctrl: QubitIndex, target: QubitIndex, matrix: &GateMatrix) { /* ... */ }
+    fn pauli_expectation(&self, pauli: &PauliString) -> f64 { /* ... */ 0.0 }
+}
+```
+
+---
+
+### 3. SimulationSession Aggregate
+
+**Root Entity**: `SimulationSession`
+**Contains**: `circuit: QuantumCircuit`, `state: QuantumState`, `backend_config: BackendConfig`, `metrics: Vec<SimulationMetrics>`, `measurement_record: Vec<MeasurementOutcome>`
+
+**Invariants**:
+- Session lifecycle is linear: Created -> Running -> Completed | Failed
+- Resources (state vector memory) are allocated only during the Running state
+- Circuit qubit count matches state qubit count
+- Backend configuration is immutable after the session enters Running
+
+```rust
+/// Lifecycle states for a simulation session.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum SessionStatus {
+    Created,
+    Running,
+    Completed,
+    Failed,
+}
+
+/// Configuration for the simulation backend.
+#[derive(Debug, Clone)]
+pub struct BackendConfig {
+    pub backend_type: BackendType,
+    pub max_memory_bytes: usize,
+    pub thread_count: usize,
+    pub seed: Option<u64>,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum BackendType {
+    StateVector,
+    TensorNetwork,
+    Stabilizer,
+}
+
+/// A single simulation execution session.
+pub struct SimulationSession {
+    id: SessionId,
+    circuit: QuantumCircuit,
+    state: Option<QuantumState>,
+    backend_config: BackendConfig,
+    status: SessionStatus,
+    metrics: Vec<SimulationMetrics>,
+    measurement_record: Vec<MeasurementOutcome>,
+    step_index: usize,
+    events: Vec<DomainEvent>,
+}
+
+impl SimulationSession {
+    pub fn new(circuit: QuantumCircuit, config: BackendConfig) -> Self {
+        Self {
+            id: SessionId::new(),
+            circuit,
+            state: None,
+            backend_config: config,
+            status: SessionStatus::Created,
+            metrics: Vec::new(),
+            measurement_record: Vec::new(),
+            step_index: 0,
+            events: Vec::new(),
+        }
+    }
+
+    /// Transition Created -> Running. Allocates state vector.
+    pub fn start(&mut self) -> Result<(), DomainError> {
+        if self.status != SessionStatus::Created {
+            return Err(DomainError::InvalidSessionTransition {
+                from: self.status,
+                to: SessionStatus::Running,
+            });
+        }
+        let state = QuantumState::new(self.circuit.qubit_count)?;
+        self.state = Some(state);
+        self.status = SessionStatus::Running;
+        self.events.push(DomainEvent::SimulationStarted {
+            session_id: self.id,
+            qubits: self.circuit.qubit_count,
+        });
+        Ok(())
+    }
+
+    /// Apply the next gate in the circuit, returning any measurement.
+    pub fn step(&mut self) -> Result<Option<MeasurementOutcome>, DomainError> {
+        self.assert_running()?;
+        let state = self.state.as_mut().unwrap();
+        if self.step_index >= self.circuit.gates.len() {
+            self.status = SessionStatus::Completed;
+            self.events.push(DomainEvent::SimulationCompleted {
+                session_id: self.id,
+                total_gates: self.circuit.gates.len() as u32,
+            });
+            return Ok(None);
+        }
+        let gate = &self.circuit.gates[self.step_index];
+        state.apply_gate(gate)?;
+        self.step_index += 1;
+        self.events.push(DomainEvent::GateApplied {
+            session_id: self.id,
+            gate_index: self.step_index as u32 - 1,
+        });
+        Ok(None)
+    }
+
+    /// Run all remaining gates to completion.
+    pub fn run_to_completion(&mut self) -> Result<SimulationMetrics, DomainError> {
+        self.assert_running()?;
+        let start = std::time::Instant::now();
+        while self.status == SessionStatus::Running {
+            self.step()?;
+        }
+        let elapsed = start.elapsed();
+        let total_gates = self.circuit.gates.len() as u32;
+        let metrics = SimulationMetrics {
+            qubits: self.circuit.qubit_count,
+            gates: total_gates,
+            time_ms: elapsed.as_secs_f64() * 1000.0,
+            peak_memory: self.state.as_ref().map(|s| s.memory_estimate().0).unwrap_or(0),
+            gates_per_sec: total_gates as f64 / elapsed.as_secs_f64(),
+        };
+        self.metrics.push(metrics.clone());
+        Ok(metrics)
+    }
+
+    /// Abort a running session, transitioning to Failed.
+    pub fn abort(&mut self, reason: &str) -> Result<(), DomainError> {
+        self.assert_running()?;
+        self.status = SessionStatus::Failed;
+        self.state = None; // Release memory
+        self.events.push(DomainEvent::SimulationFailed {
+            session_id: self.id,
+            reason: reason.to_string(),
+        });
+        Ok(())
+    }
+
+    /// Drain pending domain events.
+    pub fn take_events(&mut self) -> Vec<DomainEvent> {
+        std::mem::take(&mut self.events)
+    }
+
+    fn assert_running(&self) -> Result<(), DomainError> {
+        if self.status != SessionStatus::Running {
+            return Err(DomainError::SessionNotRunning(self.status));
+        }
+        Ok(())
+    }
+}
+```
+
+---
+
+### 4. VQEOptimization Aggregate
+
+**Root Entity**: `VQEOptimization`
+**Contains**: `ansatz_circuit: QuantumCircuit`, `hamiltonian: Hamiltonian`, `optimizer_state: OptimizerState`, `iteration_history: Vec<VQEIteration>`
+
+**Invariants**:
+- Parameter count of the ansatz circuit matches the optimizer dimension
+- Energy values are real (imaginary part zero within tolerance)
+- Convergence criteria are checked after each iteration
+
+```rust
+/// A single VQE iteration record.
+#[derive(Debug, Clone)]
+pub struct VQEIteration {
+    pub iteration: u32,
+    pub parameters: Vec<f64>,
+    pub energy: f64,
+    pub gradient_norm: f64,
+}
+
+/// Classical optimizer state.
+pub struct OptimizerState {
+    pub method: OptimizerMethod,
+    pub learning_rate: f64,
+    pub momentum: Vec<f64>,
+    pub velocity: Vec<f64>,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum OptimizerMethod { GradientDescent, Adam, COBYLA, SPSA }
+
+/// VQE variational optimization session.
+pub struct VQEOptimization {
+    id: VQEId,
+    ansatz: QuantumCircuit,
+    hamiltonian: Hamiltonian,
+    optimizer: OptimizerState,
+    history: Vec<VQEIteration>,
+    convergence_threshold: f64,
+    max_iterations: u32,
+    events: Vec<DomainEvent>,
+}
+
+impl VQEOptimization {
+    pub fn new(
+        ansatz: QuantumCircuit,
+        hamiltonian: Hamiltonian,
+        method: OptimizerMethod,
+        convergence_threshold: f64,
+        max_iterations: u32,
+    ) -> Result<Self, DomainError> {
+        let param_count = ansatz.parameter_count();
+        Ok(Self {
+            id: VQEId::new(),
+            ansatz,
+            hamiltonian,
+            optimizer: OptimizerState::new(method, param_count),
+            history: Vec::new(),
+            convergence_threshold,
+            max_iterations,
+            events: Vec::new(),
+        })
+    }
+
+    /// Evaluate the energy at the current parameter values.
+    pub fn evaluate_energy(&self, params: &[f64]) -> Result<f64, DomainError> {
+        let mut circuit = self.ansatz.clone();
+        circuit.bind_all_parameters(params)?;
+        let mut state = QuantumState::new(circuit.qubit_count)?;
+        for gate in &circuit.gates {
+            state.apply_gate(gate)?;
+        }
+        Ok(state.expectation_value(&self.hamiltonian))
+    }
+
+    /// Compute the parameter gradient using the parameter-shift rule.
+    pub fn compute_gradient(&self, params: &[f64]) -> Result<Vec<f64>, DomainError> {
+        let shift = std::f64::consts::FRAC_PI_2;
+        let mut gradient = vec![0.0; params.len()];
+        for i in 0..params.len() {
+            let mut params_plus = params.to_vec();
+            let mut params_minus = params.to_vec();
+            params_plus[i] += shift;
+            params_minus[i] -= shift;
+            let e_plus = self.evaluate_energy(&params_plus)?;
+            let e_minus = self.evaluate_energy(&params_minus)?;
+            gradient[i] = (e_plus - e_minus) / 2.0;
+        }
+        Ok(gradient)
+    }
+
+    /// Run one optimization iteration.
+    pub fn iterate(&mut self) -> Result<VQEIteration, DomainError> {
+        let params = self.optimizer.current_parameters();
+        let energy = self.evaluate_energy(&params)?;
+        let gradient = self.compute_gradient(&params)?;
+        let grad_norm = gradient.iter().map(|g| g * g).sum::<f64>().sqrt();
+        self.optimizer.update(&gradient);
+        let iteration = VQEIteration {
+            iteration: self.history.len() as u32,
+            parameters: params,
+            energy,
+            gradient_norm: grad_norm,
+        };
+        self.history.push(iteration.clone());
+        self.events.push(DomainEvent::VQEIterationCompleted {
+            vqe_id: self.id,
+            iteration: iteration.iteration,
+            energy,
+        });
+        if let Some(prev) = self.history.iter().rev().nth(1) {
+            if prev.energy > energy {
+                self.events.push(DomainEvent::VQEEnergyImproved {
+                    vqe_id: self.id,
+                    previous: prev.energy,
+                    current: energy,
+                });
+            }
+        }
+        Ok(iteration)
+    }
+
+    /// Run iterations until convergence or max_iterations.
+    pub fn converge(&mut self) -> Result<f64, DomainError> {
+        for _ in 0..self.max_iterations {
+            let iter = self.iterate()?;
+            if iter.gradient_norm < self.convergence_threshold {
+                self.events.push(DomainEvent::VQEConverged {
+                    vqe_id: self.id,
+                    final_energy: iter.energy,
+                    iterations: iter.iteration,
+                });
+                return Ok(iter.energy);
+            }
+        }
+        Err(DomainError::VQEDidNotConverge {
+            iterations: self.max_iterations,
+        })
+    }
+}
+```
+
+---
+
+### 5. SurfaceCodeExperiment Aggregate
+
+**Root Entity**: `SurfaceCodeExperiment`
+**Contains**: `code_distance: u32`, `noise_model: NoiseModel`, `decoder: Box<dyn Decoder>`, `cycle_count: u32`, `error_log: Vec<ErrorEvent>`
+
+**Invariants**:
+- Decoder is compatible with the configured code distance
+- Noise parameters are within valid probability ranges [0.0, 1.0]
+- Cycle count advances monotonically
+
+```rust
+/// Noise model for stochastic error injection.
+#[derive(Debug, Clone)]
+pub struct NoiseModel {
+    pub depolarizing_rate: NoiseRate,
+    pub measurement_error_rate: NoiseRate,
+    pub idle_error_rate: NoiseRate,
+}
+
+/// Trait for QEC decoders.
+pub trait Decoder: Send {
+    fn decode(&self, syndrome: &SyndromeBits) -> Vec<PauliOp>;
+    fn code_distance(&self) -> u32;
+}
+
+/// An error event recorded during a QEC experiment.
+#[derive(Debug, Clone)]
+pub struct ErrorEvent {
+    pub cycle: u32,
+    pub error_type: ErrorType,
+    pub qubits: Vec<QubitIndex>,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ErrorType { DataError, MeasurementError, LogicalError }
+
+/// Surface code QEC experiment.
+pub struct SurfaceCodeExperiment {
+    id: ExperimentId,
+    code_distance: u32,
+    noise_model: NoiseModel,
+    decoder: Box<dyn Decoder>,
+    cycle_count: u32,
+    error_log: Vec<ErrorEvent>,
+    logical_error_count: u32,
+    events: Vec<DomainEvent>,
+}
+
+impl SurfaceCodeExperiment {
+    pub fn new(
+        code_distance: u32,
+        noise_model: NoiseModel,
+        decoder: Box<dyn Decoder>,
+    ) -> Result<Self, DomainError> {
+        if decoder.code_distance() != code_distance {
+            return Err(DomainError::DecoderDistanceMismatch {
+                decoder: decoder.code_distance(),
+                experiment: code_distance,
+            });
+        }
+        Ok(Self {
+            id: ExperimentId::new(),
+            code_distance,
+            noise_model,
+            decoder,
+            cycle_count: 0,
+            error_log: Vec::new(),
+            logical_error_count: 0,
+            events: Vec::new(),
+        })
+    }
+
+    /// Run one QEC cycle: inject errors, extract syndrome, decode, check logical.
+    pub fn run_cycle(&mut self) -> Result<CycleReport, DomainError> {
+        self.cycle_count += 1;
+        let errors = self.inject_errors();
+        let syndrome = self.extract_syndrome(&errors);
+        let correction = self.decode(&syndrome);
+        let logical_error = self.check_logical_error(&errors, &correction);
+        if logical_error {
+            self.logical_error_count += 1;
+            self.events.push(DomainEvent::LogicalErrorDetected {
+                experiment_id: self.id,
+                cycle: self.cycle_count,
+            });
+        }
+        self.events.push(DomainEvent::SurfaceCodeCycleCompleted {
+            experiment_id: self.id,
+            cycle: self.cycle_count,
+            syndrome_weight: syndrome.0.iter().filter(|&&b| b).count() as u32,
+        });
+        Ok(CycleReport {
+            cycle: self.cycle_count,
+            syndrome,
+            correction,
+            logical_error,
+        })
+    }
+
+    /// Inject stochastic errors based on the noise model.
+    pub fn inject_errors(&mut self) -> Vec<ErrorEvent> {
+        let mut errors = Vec::new();
+        let data_qubit_count = self.code_distance * self.code_distance;
+        for q in 0..data_qubit_count {
+            if rand::random::<f64>() < self.noise_model.depolarizing_rate.0 {
+                let event = ErrorEvent {
+                    cycle: self.cycle_count,
+                    error_type: ErrorType::DataError,
+                    qubits: vec![QubitIndex(q)],
+                };
+                self.error_log.push(event.clone());
+                errors.push(event);
+            }
+        }
+        errors
+    }
+
+    /// Extract syndrome bits from the current error configuration.
+    pub fn extract_syndrome(&self, errors: &[ErrorEvent]) -> SyndromeBits {
+        let stabilizer_count = 2 * (self.code_distance - 1) * self.code_distance;
+        let mut bits = vec![false; stabilizer_count as usize];
+        for error in errors {
+            for qubit in &error.qubits {
+                let affected = self.stabilizers_for_qubit(*qubit);
+                for s in affected {
+                    bits[s] ^= true;
+                }
+            }
+        }
+        self.events.last_mut().map(|_| {
+            // SyndromeExtracted event appended in run_cycle
+        });
+        SyndromeBits(bits)
+    }
+
+    /// Apply the decoder to a syndrome.
+    pub fn decode(&self, syndrome: &SyndromeBits) -> Vec<PauliOp> {
+        self.decoder.decode(syndrome)
+    }
+
+    /// Compute the logical error rate over all cycles.
+    pub fn logical_error_rate(&self) -> LogicalErrorRate {
+        if self.cycle_count == 0 {
+            return LogicalErrorRate(0.0);
+        }
+        LogicalErrorRate(self.logical_error_count as f64 / self.cycle_count as f64)
+    }
+
+    /// Track whether a logical error occurred after correction.
+    pub fn track_logical_error(&mut self, errors: &[ErrorEvent], correction: &[PauliOp]) -> bool {
+        self.check_logical_error(errors, correction)
+    }
+
+    fn check_logical_error(&self, _errors: &[ErrorEvent], _correction: &[PauliOp]) -> bool {
+        // Computes residual Pauli frame and checks logical operator commutation
+        false // Placeholder: actual implementation checks logical X/Z operators
+    }
+
+    fn stabilizers_for_qubit(&self, _qubit: QubitIndex) -> Vec<usize> {
+        Vec::new() // Placeholder: returns stabilizer indices adjacent to qubit
+    }
+}
+```
+
+---
+
+### 6. TensorNetworkState Aggregate
+
+**Root Entity**: `TensorNetworkState`
+**Contains**: `tensor_list: Vec<Tensor>`, `contraction_path: ContractionPath`, `bond_dimensions: Vec<BondDimension>`, `qubit_mapping: HashMap<QubitIndex, TensorIndex>`
+
+**Invariants**:
+- Tensor indices are consistent across all contractions (no dangling indices)
+- Bond dimensions remain within the configured budget
+- Qubit mapping is bijective (each qubit maps to exactly one tensor site)
+
+```rust
+/// A single tensor in the network.
+#[derive(Debug, Clone)]
+pub struct Tensor {
+    pub data: Vec<Complex<f64>>,
+    pub shape: Vec<usize>,
+    pub indices: Vec<TensorIndex>,
+}
+
+/// Index identifying a tensor leg (bond or physical).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct TensorIndex(pub u32);
+
+/// An ordered sequence of pairwise contractions.
+#[derive(Debug, Clone)]
+pub struct ContractionPath(pub Vec<(TensorIndex, TensorIndex)>);
+
+/// Tensor network representation of a quantum state.
+pub struct TensorNetworkState {
+    tensors: Vec<Tensor>,
+    contraction_path: ContractionPath,
+    bond_dimensions: Vec<BondDimension>,
+    qubit_mapping: HashMap<QubitIndex, TensorIndex>,
+    max_bond_dim: BondDimension,
+}
+
+impl TensorNetworkState {
+    /// Create from an MPS (Matrix Product State) initialization.
+    pub fn new_mps(qubit_count: u32, max_bond_dim: BondDimension) -> Self {
+        let mut tensors = Vec::new();
+        let mut qubit_mapping = HashMap::new();
+        for q in 0..qubit_count {
+            let idx = TensorIndex(q);
+            tensors.push(Tensor::identity_site(idx));
+            qubit_mapping.insert(QubitIndex(q), idx);
+        }
+        Self {
+            tensors,
+            contraction_path: ContractionPath(Vec::new()),
+            bond_dimensions: vec![BondDimension(1); qubit_count as usize],
+            qubit_mapping,
+            max_bond_dim,
+        }
+    }
+
+    /// Absorb a gate as a tensor into the network.
+    pub fn add_gate_tensor(&mut self, gate: &Gate) -> Result<(), DomainError> {
+        let target_idx = self.qubit_mapping.get(&gate.target)
+            .ok_or(DomainError::UnmappedQubit(gate.target))?;
+        let gate_tensor = Tensor::from_gate(gate, *target_idx);
+        self.tensors.push(gate_tensor);
+        Ok(())
+    }
+
+    /// Contract the network along the stored contraction path.
+    pub fn contract(&mut self) -> Result<Vec<Complex<f64>>, DomainError> {
+        for (i, j) in &self.contraction_path.0 {
+            self.contract_pair(*i, *j)?;
+        }
+        Ok(self.final_tensor_data())
+    }
+
+    /// Truncate bond dimensions via SVD approximation.
+    pub fn approximate(&mut self, target_bond_dim: BondDimension) -> f64 {
+        let mut total_truncation_error = 0.0;
+        for bond in &mut self.bond_dimensions {
+            if bond.0 > target_bond_dim.0 {
+                total_truncation_error += self.truncate_bond(bond, target_bond_dim);
+                *bond = target_bond_dim;
+            }
+        }
+        total_truncation_error
+    }
+
+    /// Convert the tensor network back to a full state vector.
+    pub fn to_state_vector(&mut self) -> Result<Vec<Complex<f64>>, DomainError> {
+        self.contract()
+    }
+
+    /// Estimated contraction cost in FLOPs.
+    pub fn contraction_cost(&self) -> ContractionCost {
+        ContractionCost(ContractionPathOptimizer::estimate_cost(
+            &self.tensors,
+            &self.contraction_path,
+        ))
+    }
+
+    fn contract_pair(&mut self, _i: TensorIndex, _j: TensorIndex) -> Result<(), DomainError> {
+        Ok(()) // Implementation contracts two tensors along shared indices
+    }
+
+    fn truncate_bond(&self, _bond: &BondDimension, _target: BondDimension) -> f64 {
+        0.0 // Implementation performs SVD truncation, returns discarded weight
+    }
+
+    fn final_tensor_data(&self) -> Vec<Complex<f64>> {
+        Vec::new() // Placeholder
+    }
+}
+```
+
+---
+
+## Domain Events
+
+All domain events are immutable records of state transitions. They drive cross-aggregate communication and integration with the ruQu coherence engine.
+
+| Event | Payload | Produced By | Consumed By |
+|-------|---------|-------------|-------------|
+| `CircuitCreated` | `{ circuit_id, qubit_count }` | QuantumCircuit | SimulationSession |
+| `GateAdded` | `{ circuit_id, gate_index, gate_type }` | QuantumCircuit | Metrics |
+| `CircuitOptimized` | `{ circuit_id, gates_removed }` | QuantumCircuit | Metrics |
+| `SimulationStarted` | `{ session_id, qubits }` | SimulationSession | Monitoring, MemoryTracker |
+| `GateApplied` | `{ session_id, gate_index }` | SimulationSession | Metrics |
+| `MeasurementPerformed` | `{ session_id, qubit, result, probability }` | SimulationSession | VQE, SurfaceCode |
+| `SimulationCompleted` | `{ session_id, total_gates }` | SimulationSession | ResultRepository |
+| `SimulationFailed` | `{ session_id, reason }` | SimulationSession | Monitoring |
+| `VQEIterationCompleted` | `{ vqe_id, iteration, energy }` | VQEOptimization | Monitoring |
+| `VQEEnergyImproved` | `{ vqe_id, previous, current }` | VQEOptimization | Logging |
+| `VQEConverged` | `{ vqe_id, final_energy, iterations }` | VQEOptimization | Agent System |
+| `SurfaceCodeCycleCompleted` | `{ experiment_id, cycle, syndrome_weight }` | SurfaceCodeExperiment | Monitoring |
+| `SyndromeExtracted` | `{ experiment_id, cycle, bits }` | SurfaceCodeExperiment | CoherenceBridge |
+| `LogicalErrorDetected` | `{ experiment_id, cycle }` | SurfaceCodeExperiment | Alerting |
+| `MemoryAllocated` | `{ session_id, bytes }` | QuantumState | ResourceManager |
+| `MemoryReleased` | `{ session_id, bytes }` | SimulationSession | ResourceManager |
+| `BackendSwitched` | `{ session_id, from, to }` | SimulationSession | Monitoring |
+
+```rust
+/// All domain events in the quantum simulation engine.
+#[derive(Debug, Clone)]
+pub enum DomainEvent {
+    CircuitCreated { circuit_id: CircuitId, qubit_count: u32 },
+    GateAdded { circuit_id: CircuitId, gate_index: u32, gate_type: GateType },
+    CircuitOptimized { circuit_id: CircuitId, gates_removed: u32 },
+    SimulationStarted { session_id: SessionId, qubits: u32 },
+    GateApplied { session_id: SessionId, gate_index: u32 },
+    MeasurementPerformed { session_id: SessionId, outcome: MeasurementOutcome },
+    SimulationCompleted { session_id: SessionId, total_gates: u32 },
+    SimulationFailed { session_id: SessionId, reason: String },
+    VQEIterationCompleted { vqe_id: VQEId, iteration: u32, energy: f64 },
+    VQEEnergyImproved { vqe_id: VQEId, previous: f64, current: f64 },
+    VQEConverged { vqe_id: VQEId, final_energy: f64, iterations: u32 },
+    SurfaceCodeCycleCompleted { experiment_id: ExperimentId, cycle: u32, syndrome_weight: u32 },
+    SyndromeExtracted { experiment_id: ExperimentId, cycle: u32, bits: SyndromeBits },
+    LogicalErrorDetected { experiment_id: ExperimentId, cycle: u32 },
+    MemoryAllocated { session_id: SessionId, bytes: usize },
+    MemoryReleased { session_id: SessionId, bytes: usize },
+    BackendSwitched { session_id: SessionId, from: BackendType, to: BackendType },
+}
+```
+
+---
+
+## Domain Services
+
+Domain services encapsulate logic that does not naturally belong to a single aggregate.
+
+### GateFusionService
+
+Combines consecutive single-qubit gates on the same qubit into a single fused unitary matrix, reducing circuit depth and simulation time.
+
+```rust
+pub struct GateFusionService;
+
+impl GateFusionService {
+    /// Fuse consecutive single-qubit gates targeting the same qubit.
+    pub fn fuse(gates: &[Gate]) -> Vec<Gate> {
+        let mut result = Vec::new();
+        let mut i = 0;
+        while i < gates.len() {
+            if gates[i].control.is_none() {
+                let mut fused_matrix = gates[i].matrix.clone();
+                let target = gates[i].target;
+                let mut j = i + 1;
+                while j < gates.len()
+                    && gates[j].control.is_none()
+                    && gates[j].target == target
+                {
+                    fused_matrix = GateMatrix::multiply(&gates[j].matrix, &fused_matrix);
+                    j += 1;
+                }
+                result.push(Gate {
+                    gate_type: GateType::Custom,
+                    target,
+                    control: None,
+                    matrix: fused_matrix,
+                    parameter: None,
+                });
+                i = j;
+            } else {
+                result.push(gates[i].clone());
+                i += 1;
+            }
+        }
+        result
+    }
+}
+```
+
+### EntanglementAnalysisService
+
+Tracks qubit connectivity and suggests state-splitting boundaries for tensor network backends.
+
+```rust
+pub struct EntanglementAnalysisService;
+
+impl EntanglementAnalysisService {
+    /// Compute the entanglement graph from the circuit.
+    pub fn connectivity_graph(circuit: &QuantumCircuit) -> HashMap<QubitIndex, HashSet<QubitIndex>> {
+        let mut graph: HashMap<QubitIndex, HashSet<QubitIndex>> = HashMap::new();
+        for gate in &circuit.gates {
+            if let Some(ctrl) = gate.control {
+                graph.entry(gate.target).or_default().insert(ctrl);
+                graph.entry(ctrl).or_default().insert(gate.target);
+            }
+        }
+        graph
+    }
+
+    /// Suggest partition points for splitting state into tensor subnetworks.
+    pub fn suggest_partitions(
+        circuit: &QuantumCircuit,
+        max_partition_size: u32,
+    ) -> Vec<Vec<QubitIndex>> {
+        let graph = Self::connectivity_graph(circuit);
+        // Greedy partitioning based on connectivity
+        Self::greedy_partition(&graph, max_partition_size)
+    }
+
+    fn greedy_partition(
+        _graph: &HashMap<QubitIndex, HashSet<QubitIndex>>,
+        _max_size: u32,
+    ) -> Vec<Vec<QubitIndex>> {
+        Vec::new() // Implementation uses graph partitioning heuristics
+    }
+}
+```
+
+### ContractionPathOptimizer
+
+Finds optimal or near-optimal tensor contraction orderings to minimize total FLOP count.
+
+```rust
+pub struct ContractionPathOptimizer;
+
+impl ContractionPathOptimizer {
+    /// Find a contraction path minimizing estimated FLOPs.
+    pub fn optimize(tensors: &[Tensor]) -> ContractionPath {
+        if tensors.len() <= 10 {
+            Self::exhaustive_search(tensors)
+        } else {
+            Self::greedy_search(tensors)
+        }
+    }
+
+    /// Estimate total cost of a given contraction path.
+    pub fn estimate_cost(tensors: &[Tensor], path: &ContractionPath) -> u64 {
+        let mut cost = 0u64;
+        for (i, j) in &path.0 {
+            cost += Self::pairwise_cost(tensors, *i, *j);
+        }
+        cost
+    }
+
+    fn exhaustive_search(_tensors: &[Tensor]) -> ContractionPath { ContractionPath(Vec::new()) }
+    fn greedy_search(_tensors: &[Tensor]) -> ContractionPath { ContractionPath(Vec::new()) }
+    fn pairwise_cost(_tensors: &[Tensor], _i: TensorIndex, _j: TensorIndex) -> u64 { 0 }
+}
+```
+
+### PauliExpectationService
+
+Efficiently computes expectation values of Pauli string observables, with grouping for commuting terms.
+
+```rust
+pub struct PauliExpectationService;
+
+impl PauliExpectationService {
+    /// Group commuting Pauli terms to minimize measurement overhead.
+    pub fn group_commuting(hamiltonian: &Hamiltonian) -> Vec<Vec<(f64, PauliString)>> {
+        // Greedy coloring of the non-commutativity graph
+        Vec::new() // Implementation groups qubit-wise commuting terms
+    }
+
+    /// Compute expectation of a single Pauli string on a state vector.
+    pub fn expectation(state: &[Complex<f64>], pauli: &PauliString) -> f64 {
+        // Apply Pauli string as a diagonal/permutation operator
+        let n = (state.len() as f64).log2() as u32;
+        let mut result = 0.0;
+        for (i, amp) in state.iter().enumerate() {
+            let phase = Self::pauli_phase(i, n, pauli);
+            let j = Self::pauli_permute(i, n, pauli);
+            result += (amp.conj() * phase * state[j]).re;
+        }
+        result
+    }
+
+    fn pauli_phase(_basis: usize, _n: u32, _pauli: &PauliString) -> Complex<f64> {
+        Complex::new(1.0, 0.0) // Placeholder
+    }
+
+    fn pauli_permute(basis: usize, _n: u32, _pauli: &PauliString) -> usize {
+        basis // Placeholder: applies X/Y bit flips
+    }
+}
+```
+
+### NoiseInjectionService
+
+Applies stochastic noise channels (depolarizing, amplitude damping, measurement error) to quantum states.
+
+```rust
+pub struct NoiseInjectionService;
+
+impl NoiseInjectionService {
+    /// Apply depolarizing noise to a single qubit.
+    pub fn depolarize(state: &mut QuantumState, qubit: QubitIndex, rate: NoiseRate) {
+        if rand::random::<f64>() < rate.0 {
+            let pauli = match rand::random::<u8>() % 3 {
+                0 => GateType::X,
+                1 => GateType::Y,
+                _ => GateType::Z,
+            };
+            let gate = Gate::pauli(qubit, pauli);
+            let _ = state.apply_gate(&gate);
+        }
+    }
+
+    /// Apply measurement error: flip outcome with given probability.
+    pub fn measurement_error(outcome: &mut MeasurementOutcome, rate: NoiseRate) {
+        if rand::random::<f64>() < rate.0 {
+            outcome.result = !outcome.result;
+        }
+    }
+
+    /// Apply noise model to all data qubits.
+    pub fn apply_noise_model(state: &mut QuantumState, model: &NoiseModel) {
+        for q in 0..state.qubit_count {
+            Self::depolarize(state, QubitIndex(q), model.depolarizing_rate);
+        }
+    }
+}
+```
+
+---
+
+## Repositories
+
+Repository interfaces define persistence boundaries. Implementations live in the infrastructure layer.
+
+```rust
+/// Store and retrieve circuit templates.
+#[async_trait]
+pub trait CircuitRepository: Send + Sync {
+    async fn save(&self, circuit: &QuantumCircuit) -> Result<CircuitId, PersistenceError>;
+    async fn find_by_id(&self, id: CircuitId) -> Result<Option<QuantumCircuit>, PersistenceError>;
+    async fn find_by_qubit_count(&self, qubits: u32) -> Result<Vec<QuantumCircuit>, PersistenceError>;
+    async fn list_templates(&self) -> Result<Vec<CircuitSummary>, PersistenceError>;
+    async fn delete(&self, id: CircuitId) -> Result<bool, PersistenceError>;
+}
+
+/// Persist simulation experiment results.
+#[async_trait]
+pub trait SimulationResultRepository: Send + Sync {
+    async fn save_metrics(&self, session_id: SessionId, metrics: &SimulationMetrics)
+        -> Result<(), PersistenceError>;
+    async fn save_measurement_record(
+        &self,
+        session_id: SessionId,
+        record: &[MeasurementOutcome],
+    ) -> Result<(), PersistenceError>;
+    async fn find_by_session(&self, session_id: SessionId)
+        -> Result<Option<SimulationResult>, PersistenceError>;
+    async fn find_by_circuit(&self, circuit_id: CircuitId)
+        -> Result<Vec<SimulationResult>, PersistenceError>;
+}
+
+/// Pre-built Hamiltonians for common molecular and lattice systems.
+#[async_trait]
+pub trait HamiltonianLibrary: Send + Sync {
+    async fn get(&self, name: &str) -> Result<Option<Hamiltonian>, PersistenceError>;
+    async fn list(&self) -> Result<Vec<HamiltonianEntry>, PersistenceError>;
+    async fn save(&self, name: &str, hamiltonian: &Hamiltonian) -> Result<(), PersistenceError>;
+}
+
+/// Summary entry for library listings.
+#[derive(Debug, Clone)]
+pub struct HamiltonianEntry {
+    pub name: String,
+    pub qubit_count: u32,
+    pub term_count: usize,
+    pub description: String,
+}
+
+/// Summary entry for circuit listings.
+#[derive(Debug, Clone)]
+pub struct CircuitSummary {
+    pub id: CircuitId,
+    pub qubit_count: u32,
+    pub gate_count: u32,
+    pub depth: CircuitDepth,
+    pub name: Option<String>,
+}
+```
+
+---
+
+## Error Types
+
+```rust
+/// Domain errors for the quantum simulation engine.
+#[derive(Debug, Clone, thiserror::Error)]
+pub enum DomainError {
+    #[error("qubit index {index:?} out of range for {qubit_count}-qubit register")]
+    QubitIndexOutOfRange { index: QubitIndex, qubit_count: u32 },
+
+    #[error("qubit limit exceeded: requested {requested}, limit {limit:?}")]
+    QubitLimitExceeded { requested: u32, limit: QubitLimit },
+
+    #[error("control qubit cannot equal target qubit")]
+    SelfControlGate,
+
+    #[error("unknown parameter: {0}")]
+    UnknownParameter(String),
+
+    #[error("invalid session transition from {from:?} to {to:?}")]
+    InvalidSessionTransition { from: SessionStatus, to: SessionStatus },
+
+    #[error("session is not running (current status: {0:?})")]
+    SessionNotRunning(SessionStatus),
+
+    #[error("decoder distance {decoder} does not match experiment distance {experiment}")]
+    DecoderDistanceMismatch { decoder: u32, experiment: u32 },
+
+    #[error("VQE did not converge after {iterations} iterations")]
+    VQEDidNotConverge { iterations: u32 },
+
+    #[error("invalid fidelity value: {0}")]
+    InvalidFidelity(f64),
+
+    #[error("invalid noise rate: {0}")]
+    InvalidNoiseRate(f64),
+
+    #[error("unmapped qubit: {0:?}")]
+    UnmappedQubit(QubitIndex),
+
+    #[error("state vector not normalized")]
+    StateNotNormalized,
+
+    #[error("persistence error: {0}")]
+    Persistence(String),
+}
+```
+
+---
+
+## Identifier Types
+
+All aggregate roots use opaque, UUID-based identifiers.
+
+```rust
+macro_rules! define_id {
+    ($name:ident) => {
+        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+        pub struct $name(uuid::Uuid);
+
+        impl $name {
+            pub fn new() -> Self { Self(uuid::Uuid::new_v4()) }
+            pub fn as_bytes(&self) -> &[u8] { self.0.as_bytes() }
+        }
+
+        impl std::fmt::Display for $name {
+            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                write!(f, "{}", self.0)
+            }
+        }
+    };
+}
+
+define_id!(CircuitId);
+define_id!(SessionId);
+define_id!(VQEId);
+define_id!(ExperimentId);
+```
+
+---
+
+## References
+
+1. Evans, E. (2003). "Domain-Driven Design: Tackling Complexity in the Heart of Software."
+2. Vernon, V. (2013). "Implementing Domain-Driven Design."
+3. Nielsen, M. & Chuang, I. (2000). "Quantum Computation and Quantum Information."
+4. Coherence Engine DDD: `docs/architecture/coherence-engine-ddd.md`
+5. ruQu crate: `crates/ruQu/`