From 5e06d0575326ee600af01f24dc237c765a535fd1 Mon Sep 17 00:00:00 2001
From: Vorflux AI <noreply@vorflux.com>
Date: Mon, 23 Mar 2026 22:12:16 +0000
Subject: [PATCH] Add MemScore documentation to memorybench docs

- New memorybench/memscore.mdx page explaining the composite metric,
  token counting, where it appears (CLI, web UI, report JSON), and
  how to use it for provider comparisons
- Add MemScore to docs.json navigation under Reference group
- Update quickstart with MemScore sample output and link
- Update architecture page Report phase description
- Add MemScore card to overview page
---
 apps/docs/docs.json                    |   6 +-
 apps/docs/memorybench/architecture.mdx |   2 +-
 apps/docs/memorybench/memscore.mdx     | 120 +++++++++++++++++++++++++
 apps/docs/memorybench/overview.mdx     |   4 +
 apps/docs/memorybench/quickstart.mdx   |  33 +++----
 5 files changed, 148 insertions(+), 17 deletions(-)
 create mode 100644 apps/docs/memorybench/memscore.mdx
diff --git a/apps/docs/docs.json b/apps/docs/docs.json
index bc43aacd..fec76cf5 100644
--- a/apps/docs/docs.json
+++ b/apps/docs/docs.json
@@ -216,7 +216,11 @@
 							},
 							{
 								"group": "Reference",
-								"pages": ["memorybench/cli", "memorybench/integrations"]
+								"pages": [
+									"memorybench/memscore",
+									"memorybench/cli",
+									"memorybench/integrations"
+								]
 							}
 						]
 					}
diff --git a/apps/docs/memorybench/architecture.mdx b/apps/docs/memorybench/architecture.mdx
index 5d087817..6a37df6a 100644
--- a/apps/docs/memorybench/architecture.mdx
+++ b/apps/docs/memorybench/architecture.mdx
@@ -65,7 +65,7 @@ flowchart LR
 | **Search** | Query provider → Retrieve context |
 | **Answer** | Build prompt → Generate answer via LLM |
 | **Evaluate** | Compare to ground truth → Score via judge |
-| **Report** | Aggregate scores → Output accuracy + latency |
+| **Report** | Aggregate scores → Output accuracy, latency, token metrics, and [MemScore](/memorybench/memscore) |
 
 Each phase checkpoints independently. Failed runs resume from last successful point.
 
diff --git a/apps/docs/memorybench/memscore.mdx b/apps/docs/memorybench/memscore.mdx
new file mode 100644
index 00000000..f94f2062
--- /dev/null
+++ b/apps/docs/memorybench/memscore.mdx
@@ -0,0 +1,120 @@
+---
+title: "MemScore"
+description: "A composite metric for comparing memory providers across quality, latency, and token efficiency"
+---
+
+## What is MemScore?
+
+MemScore is a composite metric that captures three dimensions of memory provider performance in a single line:
+
+```
+accuracy% / latencyMs / contextTok
+```
+
+For example:
+
+```
+85% / 120ms / 1500tok
+```
+
+This tells you the provider achieved **85% accuracy**, with an average search latency of **120ms**, sending an average of **1,500 tokens** of context to the answering model per question.
+
+## Components
+
+| Component | What it measures | Source |
+|-----------|-----------------|--------|
+| **Quality** | Answer accuracy as a percentage | `(correct / total) * 100` from judge evaluations |
+| **Latency** | Average search response time in milliseconds | Mean of all search phase durations |
+| **Tokens** | Average context tokens sent to the answering model | Client-side token count of retrieved context per question |
+
+<Note>
+MemScore is not a single number — it's a triple. This is intentional. Collapsing quality, latency, and token usage into one score hides important tradeoffs. A provider with 90% accuracy at 5,000 tokens is very different from one with 90% accuracy at 500 tokens.
+</Note>
+
+## How token counting works
+
+MemoryBench counts tokens client-side, using a provider-specific tokenizer where one is available and a character-based approximation otherwise:
+
+| Model provider | Tokenizer | Method |
+|----------------|-----------|--------|
+| **OpenAI** | `js-tiktoken` | Exact count using `o200k_base` or `cl100k_base` encoding |
+| **Anthropic** | `@anthropic-ai/tokenizer` | Exact count using Anthropic's tokenizer |
+| **Google** | Approximation | `Math.ceil(text.length / 4)` |
+
+Three token values are tracked per question:
+
+- **`promptTokens`** — Total tokens in the full prompt (instructions + context + question)
+- **`basePromptTokens`** — Tokens in the prompt without any retrieved context
+- **`contextTokens`** — Tokens in just the retrieved context string
+
+The MemScore uses `contextTokens` because it isolates what the memory provider actually contributed.
+
+## Where MemScore appears
+
+### CLI output
+
+After a benchmark run completes, MemScore is printed in the summary:
+
+```
+SUMMARY:
+  Total Questions: 50
+  Correct: 43
+  Accuracy: 86.00%
+
+  Quality:  86%
+  Latency:  145ms (avg)
+  Tokens:   1,823 (avg context sent to answering model)
+
+  MemScore: 86% / 145ms / 1823tok
+```
+
+### Web UI
+
+The MemScore card appears at the top of the run overview page. Per-question token counts are shown next to each model answer in both the question list and detail views.
+
+### Report JSON
+
+The `report.json` file includes both a display string and structured components:
+
+```json
+{
+  "memscore": "86% / 145ms / 1823tok",
+  "memscoreComponents": {
+    "quality": 86,
+    "latencyMs": 145,
+    "contextTokens": 1823
+  },
+  "tokens": {
+    "totalTokens": 142500,
+    "basePromptTokens": 21000,
+    "contextTokens": 91150,
+    "avgTokensPerQuestion": 2850,
+    "avgBasePromptTokens": 420,
+    "avgContextTokens": 1823
+  }
+}
+```
+
+Use `memscoreComponents` for programmatic comparisons — it avoids parsing the display string.
+
+## Comparing providers
+
+MemScore is most useful when comparing providers on the same benchmark:
+
+```bash
+bun run src/index.ts compare -p supermemory,mem0,zep -b locomo -j gpt-4o
+```
+
+Each provider's report will include its own MemScore, making it easy to see tradeoffs at a glance:
+
+| Provider | MemScore |
+|----------|----------|
+| Provider A | `88% / 145ms / 1200tok` |
+| Provider B | `82% / 80ms / 2400tok` |
+| Provider C | `85% / 110ms / 1800tok` |
+
+In this example, Provider A has the highest accuracy and sends the least context, but has the slowest search. Provider B is the fastest but sends the most context while achieving the lowest accuracy — suggesting its retrieval may be less precise. Provider C lands in the middle on all three axes. There's no single "winner" — the right choice depends on whether you prioritize quality, speed, or token efficiency.
+
+## Backward compatibility
+
+Runs from before MemScore was added will still work. If token data is not present in the checkpoint, the `memscore`, `memscoreComponents`, and `tokens` fields will be omitted from the report (they read as `undefined` when the report is loaded in JavaScript). The CLI and web UI gracefully skip the MemScore display when data is unavailable.
diff --git a/apps/docs/memorybench/overview.mdx b/apps/docs/memorybench/overview.mdx
index 14e01466..0e0bd11e 100644
--- a/apps/docs/memorybench/overview.mdx
+++ b/apps/docs/memorybench/overview.mdx
@@ -24,6 +24,10 @@ Our goal is to make evaluation more rigorous, accessible, and in line with indus
   <Card title="Architecture" icon="blocks" href="/memorybench/architecture">
       Understanding MemoryBench's design and implementation
   </Card>
+
+  <Card title="MemScore" icon="gauge" href="/memorybench/memscore">
+      Composite metric for comparing quality, latency, and token efficiency
+  </Card>
 </Columns>
 
 ## Works with any memory provider
diff --git a/apps/docs/memorybench/quickstart.mdx b/apps/docs/memorybench/quickstart.mdx
index e52094a9..645117fd 100644
--- a/apps/docs/memorybench/quickstart.mdx
+++ b/apps/docs/memorybench/quickstart.mdx
@@ -38,24 +38,27 @@ Run the same benchmark across multiple providers:
 bun run src/index.ts compare -p supermemory,mem0,zep -b locomo -j gpt-4o
 ```
 
-Results are saved to `data/runs/{runId}/report.json`.
-
 ## Sample Output
 
-```json
-{
-  "accuracy": 0.72,
-  "accuracyByType": {
-    "single-hop": 0.85,
-    "multi-hop": 0.65,
-    "temporal": 0.70,
-    "adversarial": 0.68
-  },
-  "avgLatency": 1250,
-  "totalQuestions": 50
-}
+Each run produces a [MemScore](/memorybench/memscore) — a composite metric capturing quality, latency, and token efficiency:
+
 ```
+SUMMARY:
+  Total Questions: 50
+  Correct: 36
+  Accuracy: 72.00%
+
+  Quality:  72%
+  Latency:  1250ms (avg)
+  Tokens:   1,823 (avg context sent to answering model)
+
+  MemScore: 72% / 1250ms / 1823tok
+```
+
+Full results are saved to `data/runs/{runId}/report.json` with detailed breakdowns by question type, latency percentiles, and per-question token counts.
 
 ## What's Next
 
-Head to [CLI Reference](/memorybench/cli) to play around with all the commands, or check out [Architecture](/memorybench/architecture) to understand how MemoryBench works under the hood.
+- [MemScore](/memorybench/memscore) — understand the composite metric and how to compare providers
+- [CLI Reference](/memorybench/cli) — all available commands
+- [Architecture](/memorybench/architecture) — how MemoryBench works under the hood