ruvector/scripts/lib/rvf-builder.mjs
rUv 930fca916f feat(sse): decouple SSE to mcp.pi.ruv.io proxy + Claude Code source research
SSE Proxy Decoupling (ADR-130):
- Fix ruvbrain-sse proxy: proper MCP handshake, session creation, drain polling
- Fix internal queue endpoints: session_create keeps receiver, drain returns buffered messages
- Add response_queues to AppState for SSE proxy communication
- Skip sparsifier for >5M edge graphs (was crashing on 16M edges)
- Add SSE_DISABLED/MAX_SSE env vars for configurable connection limits
- Route SSE to dedicated mcp.pi.ruv.io subdomain (Cloudflare CNAME)
- Serve SSE at root / path on proxy (no /sse needed)
- Update all references from pi.ruv.io/sse to mcp.pi.ruv.io
- Fix Dockerfile consciousness crate build (feature/version mismatches)

Claude Code CLI Source Research (ADR-133):
- 19 research documents analyzing Claude Code internals (3000+ lines)
- Decompiler script + RVF corpus builder for all major versions
- Binary RVF containers for v0.2, v1.0, v2.0, v2.1 (300-2068 vectors each)
- Call graphs, class hierarchies, state machines from minified source

Integration Strategy (ADR-134):
- 6-tier integration plan: WASM MCP, agents, hooks, cache, SDK, plugin
- Integration guide with architecture diagrams and performance targets

Co-Authored-By: claude-flow <ruv@ruv.net>
2026-04-02 23:39:56 +00:00

259 lines
7.3 KiB
JavaScript
Executable file

#!/usr/bin/env node
/**
* rvf-builder.mjs - Create binary RVF containers from extracted source modules.
*
* Uses the @ruvector/rvf-node native backend to produce real binary .rvf files
* with HNSW-indexed vector embeddings and witness chains.
*
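* Each source fragment is embedded as a deterministic vector derived from its
* content hash (a lightweight "fingerprint" embedding). Identical fragments map
* to identical vectors, which allows exact-match lookup and change detection
* across versions without requiring a full ML embedding model; the vectors do
* not encode semantic similarity.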
*
* Usage:
* node scripts/lib/rvf-builder.mjs <source-dir> <output.rvf> [--meta key=val ...]
*
* source-dir : directory with .js module files + metrics.json
* output.rvf : path for the binary RVF container
* --meta : optional key=value metadata pairs
*/
import { readFileSync, readdirSync, existsSync, writeFileSync } from 'fs';
import { join, basename, resolve } from 'path';
import { createHash } from 'crypto';
import { pathToFileURL } from 'url';
// Number of components in every fingerprint embedding. This value sizes the
// vectors produced by fingerprintVector() and is passed as `dimension` when
// the RVF database is created, so the two always agree.
const DIMENSIONS = 128;
/**
 * Derive a reproducible, unit-length fingerprint vector from text content.
 *
 * The text is hashed with SHA-256 and the 32 digest bytes are stretched to
 * DIMENSIONS components: each component combines two digest bytes into a
 * 16-bit value that is rescaled to [-1, 1]. The result is then L2-normalized.
 * Identical input always produces an identical vector. This is a content
 * fingerprint for exact-match deduplication and change detection, NOT a
 * semantic embedding.
 *
 * @param {string} text - Content to fingerprint.
 * @returns {Float32Array} Vector with DIMENSIONS components.
 */
function fingerprintVector(text) {
  const digest = createHash('sha256').update(text).digest();

  // Component k uses digest byte (k mod 32) as the high byte and digest byte
  // ((7k + 13) mod 32) as the low byte, then maps 0..65535 onto [-1, 1].
  const out = Float32Array.from({ length: DIMENSIONS }, (_, k) => {
    const hi = digest[k % 32];
    const lo = digest[(k * 7 + 13) % 32];
    return ((hi * 256 + lo) / 65535) * 2 - 1;
  });

  // Scale to unit length for cosine distance. The guard skips the division
  // when the magnitude is zero.
  const magnitude = Math.sqrt(out.reduce((sum, x) => sum + x * x, 0));
  if (magnitude > 0) {
    out.forEach((x, k) => {
      out[k] = x / magnitude;
    });
  }
  return out;
}
/**
 * Locate and load the native rvf-node backend.
 *
 * A fixed list of candidate entry files, resolved against the current working
 * directory, is checked in order and the first one that exists on disk is
 * imported. Later candidates are not tried if that import itself fails.
 *
 * @returns {Promise<*>} The module's `RvfDatabase` export (named, or found on
 *   the default export); if neither is present, the module namespace itself.
 * @throws {Error} When none of the candidate files exists. The message lists
 *   every path that was tried.
 */
async function loadRvfNode() {
  const candidates = [
    resolve(process.cwd(), 'npm/packages/rvf-node/index.js'),
    resolve(process.cwd(), 'node_modules/@ruvector/rvf-node/index.js'),
  ];

  const entryFile = candidates.find((candidate) => existsSync(candidate));
  if (entryFile === undefined) {
    throw new Error(
      `Could not find @ruvector/rvf-node. Tried:\n ${candidates.join('\n ')}`
    );
  }

  // import() takes a URL-style specifier, not a filesystem path. A raw
  // absolute path happens to work on POSIX but not on Windows (drive letters
  // and backslashes), so convert it to a file:// URL first.
  const mod = await import(pathToFileURL(entryFile).href);
  return mod.RvfDatabase ?? mod.default?.RvfDatabase ?? mod;
}
/**
 * Collect `--meta key=value` pairs from a list of CLI arguments.
 *
 * Every `--meta` token that is followed by a non-empty argument consumes that
 * argument. The text before the first `=` becomes the key and everything
 * after it (further `=` characters included) becomes the value; an argument
 * without `=` yields an empty-string value. When a key repeats, the last
 * occurrence wins. All other arguments are ignored.
 *
 * @param {string[]} argv - Arguments to scan.
 * @returns {Object<string, string>} Parsed metadata.
 */
function parseMeta(argv) {
  const collected = {};
  let pos = 0;
  while (pos < argv.length) {
    const pair = argv[pos + 1];
    if (argv[pos] !== '--meta' || !pair) {
      pos += 1;
      continue;
    }
    const eq = pair.indexOf('=');
    const key = eq === -1 ? pair : pair.slice(0, eq);
    const value = eq === -1 ? '' : pair.slice(eq + 1);
    collected[key] = value;
    // Step past both the flag and the pair it consumed.
    pos += 2;
  }
  return collected;
}
/**
 * CLI entry point: build a binary RVF container from a directory of modules.
 *
 * Reads `<source-dir>` and `<output.rvf>` from process.argv (plus optional
 * `--meta key=val` pairs), loads the rvf-node backend, splits every `.js`
 * file in the source directory into blank-line-separated fragments, ingests
 * one fingerprint vector per fragment, and writes a `<output.rvf>.manifest.json`
 * sidecar describing the container. A one-line JSON summary is printed last
 * for callers that parse stdout.
 *
 * The process exits with status 1 when the required arguments are missing,
 * the backend cannot be loaded, or the source directory has no `.js` files.
 * Other failures (unreadable files, invalid metrics.json, backend errors)
 * propagate as a rejected promise.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);
  const [sourceDir, outputRvf] = args;
  if (!sourceDir || !outputRvf) {
    console.error(
      'Usage: node rvf-builder.mjs <source-dir> <output.rvf> [--meta key=val ...]'
    );
    process.exit(1);
  }
  const meta = parseMeta(args.slice(2));

  // Load native RVF module
  let RvfDatabase;
  try {
    RvfDatabase = await loadRvfNode();
  } catch (err) {
    console.error('Failed to load @ruvector/rvf-node:', err.message);
    process.exit(1);
  }

  // Read metrics if available; every field taken from it below has a fallback.
  const metricsPath = join(sourceDir, 'metrics.json');
  let metrics = {};
  if (existsSync(metricsPath)) {
    metrics = JSON.parse(readFileSync(metricsPath, 'utf-8'));
  }

  // Collect all .js module files, sorted so vector IDs are assigned in a
  // stable order from run to run.
  const moduleFiles = readdirSync(sourceDir)
    .filter((f) => f.endsWith('.js'))
    .sort();
  if (moduleFiles.length === 0) {
    console.error(`No .js module files found in ${sourceDir}`);
    process.exit(1);
  }
  console.log(
    `Building RVF container: ${basename(outputRvf)} (${moduleFiles.length} modules, ${DIMENSIONS}d vectors)`
  );

  // Create the RVF database.
  // NOTE(review): `m` / `ef_construction` are presumably HNSW index tuning
  // parameters and `profile: 0` a backend preset — confirm against rvf-node.
  const db = RvfDatabase.create(outputRvf, {
    dimension: DIMENSIONS,
    metric: 'Cosine',
    profile: 0,
    compression: 'None',
    signing: false,
    m: 16,
    ef_construction: 200,
  });

  const sidecarPath = `${outputRvf}.manifest.json`;
  let totalFragments = 0;
  let status;
  let fileId;

  // Everything that uses the open handle runs inside try/finally so the
  // database is closed even when ingest or the manifest write throws.
  try {
    // Vector IDs start at 1 and keep increasing across all modules.
    let vectorId = 1;
    const idMap = {};

    for (const modFile of moduleFiles) {
      const modName = basename(modFile, '.js');
      const content = readFileSync(join(sourceDir, modFile), 'utf-8');
      // A fragment is a blank-line-separated chunk; chunks of 10 or fewer
      // non-whitespace-trimmed characters are dropped.
      const fragments = content.split('\n\n').filter((f) => f.trim().length > 10);
      if (fragments.length === 0) continue;

      // Build a flat vector array and IDs for batch ingest
      const vectors = new Float32Array(fragments.length * DIMENSIONS);
      const ids = [];
      for (let i = 0; i < fragments.length; i++) {
        const fragment = fragments[i];
        vectors.set(fingerprintVector(fragment), i * DIMENSIONS);
        ids.push(vectorId);
        idMap[vectorId] = {
          module: modName,
          fragmentIndex: i,
          sizeBytes: Buffer.byteLength(fragment),
          hash: createHash('sha256').update(fragment).digest('hex').slice(0, 16),
        };
        vectorId++;
      }

      const batch = db.ingestBatch(vectors, ids);
      totalFragments += batch.accepted;
      console.log(
        ` ${modName}: ${batch.accepted} vectors ingested (${fragments.length} fragments)`
      );
    }

    // Get final status
    status = db.status();
    fileId = db.fileId();
    const segments = db.segments();

    // Write the manifest sidecar next to the container. It carries the
    // vector-ID -> fragment mapping together with container and source
    // metadata.
    const manifest = {
      format: 'rvf-binary',
      version: '1.0',
      fileId,
      dimensions: DIMENSIONS,
      metric: 'cosine',
      totalVectors: status.totalVectors,
      totalSegments: status.totalSegments,
      fileSizeBytes: status.fileSize,
      epoch: status.currentEpoch,
      segments: segments.map((s) => ({
        id: s.id,
        type: s.segType,
        offset: s.offset,
        payloadLength: s.payloadLength,
      })),
      source: {
        package: meta.package || '@anthropic-ai/claude-code',
        version: meta.version || metrics.version || 'unknown',
        extractedAt: metrics.extractedAt || new Date().toISOString(),
        metrics: {
          bundleSizeBytes: metrics.sizeBytes || 0,
          classes: metrics.classes || 0,
          functions: metrics.functions || 0,
          asyncFunctions: metrics.asyncFunctions || 0,
          arrowFunctions: metrics.arrowFunctions || 0,
        },
      },
      modules: Object.entries(metrics.modules || {}).map(([name, info]) => ({
        name,
        ...info,
      })),
      idMap,
      meta,
      createdAt: new Date().toISOString(),
    };
    writeFileSync(sidecarPath, JSON.stringify(manifest, null, 2));
  } finally {
    db.close();
  }

  console.log(`\nRVF container created successfully:`);
  console.log(` File: ${outputRvf}`);
  console.log(` File ID: ${fileId}`);
  console.log(` Vectors: ${totalFragments}`);
  console.log(` Segments: ${status.totalSegments}`);
  console.log(` Size: ${(status.fileSize / 1024).toFixed(1)} KB`);
  console.log(` Manifest: ${sidecarPath}`);

  // Output JSON for caller
  const summary = {
    success: true,
    path: outputRvf,
    fileId,
    vectors: totalFragments,
    segments: status.totalSegments,
    sizeBytes: status.fileSize,
  };
  console.log(JSON.stringify(summary));
}
// Run the CLI. Any rejection that escapes main() is printed and converted
// into a non-zero exit status.
const reportFatal = (failure) => {
  console.error('Fatal error:', failure);
  process.exit(1);
};
main().catch(reportFatal);