fix(ruvector): CLI works on fresh DBs via meta sidecar (#417)

Six CLI commands crashed on every fresh database produced by
`ruvector create`:

    $ ruvector create /tmp/x.db -d 384
    $ ruvector insert /tmp/x.db /tmp/v.json
    SyntaxError: Unexpected token 'r', "redb…" is not valid JSON

Root cause: `bin/cli.js` `insert`, `search`, `stats`, `export`, and
`import` all did `JSON.parse(fs.readFileSync(dbPath, 'utf8'))` to
recover the dimension. But `<dbPath>` is a redb (Rust binary) file
managed by `@ruvector/core` — not a JSON document. The first byte
("r") tripped the parser before any other code ran.

Compounding: the same handlers called methods that don't exist on
`VectorDBWrapper` (`db.load`, `db.save`, `db.stats`) and didn't
`await` the async wrapper methods that do exist (`insert`,
`insertBatch`, `search`, `len`).

Fix:

- Persist construction args (dimensions, metric, schema version)
  in `<dbPath>.meta.json` from `create`. `insert`/`search`/`stats`
  read the sidecar and pass them straight to the wrapper
  constructor — no more JSON-parsing of redb bytes.
- Drop calls to the phantom `db.load`/`db.save`/`db.stats` API.
  Persistence is automatic via `storagePath`; counting goes through
  `await db.len()`.
- Make every CLI handler `async` and `await` the wrapper calls.
  Includes `benchmark`, whose previously-dropped promises meant the
  reported insert/search rates were just spinner timing.
- Coerce numeric ids to strings inside `insert` (the native binding
  rejects integer ids).
- Surface a clear, actionable error when a DB exists without a
  sidecar (e.g. created by an older CLI), instead of an opaque
  parse failure.

Verified end-to-end with a new test on Node 22.22.2:

    $ node test/cli-fresh-db.test.mjs
      ok: `ruvector create` exits 0
      ok: redb file exists at dbPath
      ok: sidecar metadata file exists
      ok: sidecar.dimensions = 8
      ok: sidecar.metric = cosine
      ok: `ruvector insert` exits 0
      ok: insert does not crash JSON.parsing the redb binary
      ok: `ruvector search` exits 0
      ok: search prints `Found N results`
      ok: search renders at least one hit row
      ok: `ruvector stats` exits 0
      ok: stats prints Vector Count
      ok: stats fails fast on orphan DB without sidecar
      ok: orphan-DB error message mentions sidecar

    ruvector fresh-DB CLI smoke OK (issue #417)

Out of scope (deliberately): the `export`/`import` handlers also
called the same phantom API. Those need the wrapper to grow an
enumeration method (`db.entries()` or similar) before they can do
honest work — an export that could only emit file-level metadata
(and no vectors) would be misleading. Tracked in a follow-up; the
existing handlers are left untouched here.

The ONNX-bundle half of #417 ships in a separate PR (#354).

Closes #417

Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
ruvnet 2026-05-07 15:47:09 -04:00
parent e383476014
commit 3b00a565eb
2 changed files with 236 additions and 58 deletions

View file

@ -45,6 +45,64 @@ function requireRuvector() {
}
}
// =============================================================================
// Database metadata sidecar (#417)
// -----------------------------------------------------------------------------
// `<dbPath>` is a redb (Rust binary) file managed by @ruvector/core. It is NOT
// a JSON document, so the previous implementation that called
// `JSON.parse(fs.readFileSync(dbPath))` to recover dimensions crashed
// immediately on the redb magic bytes "redb…".
//
// Instead, every `create` writes `<dbPath>.meta.json` carrying the construction
// args (dimensions, metric, schema version). `insert`, `search`, `stats` and
// friends read from the sidecar and pass them straight to the wrapper
// constructor.
// =============================================================================
// Version stamp for the sidecar file format.
const META_SCHEMA_VERSION = 1;

/**
 * Build the path of the JSON metadata sidecar that sits next to a database.
 *
 * @param {string} dbPath - Path of the database file.
 * @returns {string} `dbPath` with `.meta.json` appended.
 */
function metaPathFor(dbPath) {
  const sidecarSuffix = '.meta.json';
  return `${dbPath}${sidecarSuffix}`;
}
/**
 * Write the metadata sidecar for a database.
 *
 * The sidecar is pretty-printed JSON holding the schema version, the
 * construction args taken from `meta`, the CLI version from `packageJson`,
 * and a creation timestamp in ISO-8601 form.
 *
 * @param {string} dbPath - Path of the database file; the sidecar path is
 *   derived from it via `metaPathFor`.
 * @param {{dimensions: number, metric: string}} meta - Construction args to
 *   persist. Only `dimensions` and `metric` are read.
 * @returns {void}
 */
function writeMeta(dbPath, meta) {
  const { dimensions, metric } = meta;
  const sidecarJson = JSON.stringify(
    {
      schemaVersion: META_SCHEMA_VERSION,
      dimensions,
      metric,
      cliVersion: packageJson.version,
      createdAt: new Date().toISOString(),
    },
    null,
    2,
  );
  fs.writeFileSync(metaPathFor(dbPath), sidecarJson);
}
/**
 * Load and validate the metadata sidecar that accompanies a database file.
 *
 * @param {string} dbPath - Path of the database file; the sidecar path is
 *   derived from it via `metaPathFor`.
 * @returns {object} The parsed sidecar. `dimensions` is guaranteed to be a
 *   positive integer; other fields are returned as stored, unvalidated.
 * @throws {Error} If neither the database nor the sidecar exists, if the
 *   database exists without a sidecar, or if the sidecar is not valid JSON,
 *   is not a JSON object, or lacks a positive integer `dimensions`.
 */
function readMeta(dbPath) {
  const metaPath = metaPathFor(dbPath);
  if (!fs.existsSync(metaPath)) {
    // Distinguish "nothing here at all" from "database present but no
    // sidecar" so the user gets the remedy that matches their situation.
    if (!fs.existsSync(dbPath)) {
      throw new Error(
        `Database not found: ${dbPath}\n` +
        ` Run "ruvector create ${dbPath}" first.`,
      );
    }
    throw new Error(
      `Database metadata sidecar not found: ${metaPath}\n` +
      ` This database was created without a sidecar (e.g. before #417 was fixed).\n` +
      ` Recreate it with "ruvector create ${dbPath} -d <dimensions> -m <metric>".`,
    );
  }
  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
  } catch (e) {
    // Keep the original error reachable via `cause` for debugging.
    throw new Error(`Invalid sidecar at ${metaPath}: ${e.message}`, { cause: e });
  }
  // Valid JSON is not necessarily an object: `null`, a number, a string or an
  // array would otherwise fail below with an unrelated TypeError (for `null`)
  // or a misleading message.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new Error(`Invalid sidecar at ${metaPath}: expected a JSON object`);
  }
  // Number.isInteger also rejects fractional and non-finite values
  // (JSON.parse turns e.g. `1e999` into Infinity).
  if (!Number.isInteger(parsed.dimensions) || parsed.dimensions <= 0) {
    throw new Error(`Invalid sidecar at ${metaPath}: missing or invalid dimensions`);
  }
  return parsed;
}
// Lazy load GNN (optional - loaded on first use, not at startup)
// Saves ~6ms startup time by deferring require('@ruvector/gnn')
let _gnnModule = undefined; // undefined = not yet attempted, null = failed, object = loaded
@ -157,16 +215,25 @@ program
const spinner = ora('Creating database...').start();
try {
const dimension = parseInt(options.dimension);
const db = new VectorDB({
dimensions: dimension,
const dimensions = parseInt(options.dimension);
// Construct the redb-backed DB; this creates the file at `dbPath`.
// Persistence is automatic via `storagePath` — there is no
// separate save() call.
// eslint-disable-next-line no-new
new VectorDB({
dimensions,
metric: options.metric,
storagePath: dbPath,
});
// Persist the construction args so subsequent commands can recover
// them without trying to JSON.parse() the redb binary (#417).
writeMeta(dbPath, { dimensions, metric: options.metric });
spinner.succeed(chalk.green(`Database created: ${dbPath}`));
console.log(chalk.gray(` Dimension: ${dimension}`));
console.log(chalk.gray(` Dimension: ${dimensions}`));
console.log(chalk.gray(` Metric: ${options.metric}`));
console.log(chalk.gray(` Sidecar: ${metaPathFor(dbPath)}`));
console.log(chalk.gray(` Implementation: ${getImplementationType()}`));
} catch (error) {
spinner.fail(chalk.red('Failed to create database'));
@ -180,43 +247,39 @@ program
.command('insert <database> <file>')
.description('Insert vectors from JSON file')
.option('-b, --batch-size <number>', 'Batch size for insertion', '1000')
.action((dbPath, file, options) => {
.action(async (dbPath, file, options) => {
requireRuvector();
const spinner = ora('Loading database...').start();
try {
// Read database metadata to get dimension
let dimension = 384; // default
if (fs.existsSync(dbPath)) {
const dbData = fs.readFileSync(dbPath, 'utf8');
const parsed = JSON.parse(dbData);
dimension = parsed.dimension || 384;
}
const db = new VectorDB({ dimension });
if (fs.existsSync(dbPath)) {
db.load(dbPath);
}
const meta = readMeta(dbPath);
const db = new VectorDB({
dimensions: meta.dimensions,
metric: meta.metric,
storagePath: dbPath,
});
spinner.text = 'Reading vectors...';
const data = JSON.parse(fs.readFileSync(file, 'utf8'));
const vectors = Array.isArray(data) ? data : [data];
// Coerce integer ids to strings — the native binding requires string ids.
for (const v of vectors) {
if (typeof v.id === 'number') v.id = String(v.id);
}
spinner.text = `Inserting ${vectors.length} vectors...`;
const batchSize = parseInt(options.batchSize);
for (let i = 0; i < vectors.length; i += batchSize) {
const batch = vectors.slice(i, i + batchSize);
db.insertBatch(batch);
await db.insertBatch(batch);
spinner.text = `Inserted ${Math.min(i + batchSize, vectors.length)}/${vectors.length} vectors...`;
}
db.save(dbPath);
const total = await db.len();
spinner.succeed(chalk.green(`Inserted ${vectors.length} vectors`));
const stats = db.stats();
console.log(chalk.gray(` Total vectors: ${stats.count}`));
console.log(chalk.gray(` Total vectors: ${total}`));
} catch (error) {
spinner.fail(chalk.red('Failed to insert vectors'));
console.error(chalk.red(error.message));
@ -232,18 +295,17 @@ program
.option('-k, --top-k <number>', 'Number of results', '10')
.option('-t, --threshold <number>', 'Similarity threshold', '0.0')
.option('-f, --filter <json>', 'Metadata filter as JSON')
.action((dbPath, options) => {
.action(async (dbPath, options) => {
requireRuvector();
const spinner = ora('Loading database...').start();
try {
// Read database metadata
const dbData = fs.readFileSync(dbPath, 'utf8');
const parsed = JSON.parse(dbData);
const dimension = parsed.dimension || 384;
const db = new VectorDB({ dimension });
db.load(dbPath);
const meta = readMeta(dbPath);
const db = new VectorDB({
dimensions: meta.dimensions,
metric: meta.metric,
storagePath: dbPath,
});
spinner.text = 'Searching...';
@ -251,18 +313,21 @@ program
const query = {
vector,
k: parseInt(options.topK),
threshold: parseFloat(options.threshold)
};
if (options.filter) {
query.filter = JSON.parse(options.filter);
}
const results = db.search(query);
spinner.succeed(chalk.green(`Found ${results.length} results`));
const results = await db.search(query);
const threshold = parseFloat(options.threshold);
const filtered = threshold > 0
? results.filter((r) => r.score >= threshold)
: results;
spinner.succeed(chalk.green(`Found ${filtered.length} results`));
console.log(chalk.cyan('\nSearch Results:'));
results.forEach((result, i) => {
filtered.forEach((result, i) => {
console.log(chalk.white(`\n${i + 1}. ID: ${result.id}`));
console.log(chalk.yellow(` Score: ${result.score.toFixed(4)}`));
if (result.metadata) {
@ -280,35 +345,32 @@ program
program
.command('stats <database>')
.description('Show database statistics')
.action((dbPath) => {
.action(async (dbPath) => {
requireRuvector();
const spinner = ora('Loading database...').start();
try {
const dbData = fs.readFileSync(dbPath, 'utf8');
const parsed = JSON.parse(dbData);
const dimension = parsed.dimension || 384;
const meta = readMeta(dbPath);
const db = new VectorDB({
dimensions: meta.dimensions,
metric: meta.metric,
storagePath: dbPath,
});
const db = new VectorDB({ dimension });
db.load(dbPath);
const stats = db.stats();
const count = await db.len();
spinner.succeed(chalk.green('Database statistics'));
console.log(chalk.cyan('\nDatabase Stats:'));
console.log(chalk.white(` Vector Count: ${chalk.yellow(stats.count)}`));
console.log(chalk.white(` Dimension: ${chalk.yellow(stats.dimension)}`));
console.log(chalk.white(` Metric: ${chalk.yellow(stats.metric)}`));
console.log(chalk.white(` Vector Count: ${chalk.yellow(count)}`));
console.log(chalk.white(` Dimension: ${chalk.yellow(meta.dimensions)}`));
console.log(chalk.white(` Metric: ${chalk.yellow(meta.metric)}`));
console.log(chalk.white(` Implementation: ${chalk.yellow(getImplementationType())}`));
if (stats.memoryUsage) {
const mb = (stats.memoryUsage / (1024 * 1024)).toFixed(2);
console.log(chalk.white(` Memory Usage: ${chalk.yellow(mb + ' MB')}`));
if (fs.existsSync(dbPath)) {
const fileStats = fs.statSync(dbPath);
const fileMb = (fileStats.size / (1024 * 1024)).toFixed(2);
console.log(chalk.white(` File Size: ${chalk.yellow(fileMb + ' MB')}`));
}
const fileStats = fs.statSync(dbPath);
const fileMb = (fileStats.size / (1024 * 1024)).toFixed(2);
console.log(chalk.white(` File Size: ${chalk.yellow(fileMb + ' MB')}`));
} catch (error) {
spinner.fail(chalk.red('Failed to load database'));
console.error(chalk.red(error.message));
@ -323,7 +385,7 @@ program
.option('-d, --dimension <number>', 'Vector dimension', '384')
.option('-n, --num-vectors <number>', 'Number of vectors', '10000')
.option('-q, --num-queries <number>', 'Number of queries', '1000')
.action((options) => {
.action(async (options) => {
requireRuvector();
console.log(chalk.cyan('\nruvector Performance Benchmark'));
console.log(chalk.gray(`Implementation: ${getImplementationType()}\n`));
@ -338,7 +400,7 @@ program
const db = new VectorDB({ dimensions: dimension, metric: 'cosine' });
spinner.succeed();
// Insert benchmark
// Insert benchmark — must await, the wrapper resolves on actual native completion.
spinner = ora(`Inserting ${numVectors} vectors...`).start();
const insertStart = Date.now();
@ -351,14 +413,16 @@ program
});
}
db.insertBatch(vectors);
await db.insertBatch(vectors);
const insertTime = Date.now() - insertStart;
const insertRate = (numVectors / (insertTime / 1000)).toFixed(0);
spinner.succeed(chalk.green(`Inserted ${numVectors} vectors in ${insertTime}ms`));
console.log(chalk.gray(` Rate: ${chalk.yellow(insertRate)} vectors/sec`));
// Search benchmark
// Search benchmark — must await each query (#417: previously the
// promises were dropped on the floor and the reported rate was just
// spinner timing).
spinner = ora(`Running ${numQueries} searches...`).start();
const searchStart = Date.now();
@ -367,7 +431,7 @@ program
vector: Array.from({ length: dimension }, () => Math.random()),
k: 10
};
db.search(query);
await db.search(query);
}
const searchTime = Date.now() - searchStart;

View file

@ -0,0 +1,114 @@
// End-to-end CLI smoke for issue #417 — verifies that `ruvector create`,
// `insert`, `search`, and `stats` work on a fresh database (the old
// implementation crashed on every command after `create` because it
// JSON.parse()d the redb binary file).
//
// Run with:
//
// node test/cli-fresh-db.test.mjs
import { spawnSync } from 'node:child_process';
import { fileURLToPath } from 'node:url';
import { dirname, join } from 'node:path';
import { mkdtempSync, rmSync, writeFileSync, existsSync, readFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
// Running tally of failed checks; incremented by check() and inspected at the
// end of the script to choose the exit code.
let failures = 0;

// ESM has no built-in __dirname, so derive it from this module's URL.
const thisFile = fileURLToPath(import.meta.url);
const __dirname = dirname(thisFile);

// The CLI entry point lives at <repo>/bin/cli.js, one level above this file's
// directory.
const repoRoot = join(__dirname, '..');
const cli = join(repoRoot, 'bin/cli.js');

// Every run gets its own scratch directory under the OS temp dir; the test
// database is created inside it.
const scratchPrefix = join(tmpdir(), 'ruvector-417-');
const tmp = mkdtempSync(scratchPrefix);
const dbPath = join(tmp, 'test.db');
/**
 * Record the outcome of a single smoke-test expectation.
 *
 * A truthy `cond` logs an "ok" line. A falsy one logs a FAIL line, followed
 * by `extra` when it is truthy, and bumps the module-level `failures` counter.
 * Never throws, so later checks still run after a failure.
 *
 * @param {*} cond - Result of the expectation; evaluated for truthiness.
 * @param {string} msg - Human-readable description of the expectation.
 * @param {*} [extra] - Optional diagnostic detail printed on failure only.
 * @returns {void}
 */
function check(cond, msg, extra) {
  if (cond) {
    console.log(' ok:', msg);
    return;
  }
  console.error('FAIL:', msg);
  if (extra) {
    console.error(' ', extra);
  }
  failures += 1;
}
/**
 * Run the CLI synchronously in a child Node process.
 *
 * The child is launched with the same Node binary as this script, with
 * `repoRoot` as its working directory and its output decoded as UTF-8.
 *
 * @param {string[]} args - Arguments passed to the CLI after the script path.
 * @param {Object<string, string>} [env={}] - Extra environment variables;
 *   they override same-named entries inherited from `process.env`.
 * @returns {object} The `spawnSync` result (`status`, `stdout`, `stderr`, ...).
 */
function runCli(args, env = {}) {
  const argv = [cli, ...args];
  const childEnv = { ...process.env, ...env };
  const spawnOptions = {
    cwd: repoRoot,
    encoding: 'utf8',
    env: childEnv,
  };
  return spawnSync(process.execPath, argv, spawnOptions);
}
// Drive the CLI through create → insert → search → stats against a scratch
// database, recording each expectation through check(). The scratch directory
// is removed in `finally` whatever the outcome.
try {
  // 1. create — should succeed AND drop a sidecar.
  let res = runCli(['create', dbPath, '-d', '8', '-m', 'cosine']);
  check(res.status === 0, '`ruvector create` exits 0', res.stderr || res.stdout);
  check(existsSync(dbPath), 'redb file exists at dbPath');
  const sidecar = `${dbPath}.meta.json`;
  check(existsSync(sidecar), 'sidecar metadata file exists');

  // Read the sidecar defensively. If `create` failed, the file may be missing
  // or malformed; an unguarded read would throw here and abort the run with a
  // stack trace, hiding the check results and the failure summary.
  let meta = null;
  try {
    meta = JSON.parse(readFileSync(sidecar, 'utf8'));
  } catch (err) {
    check(false, 'sidecar metadata is readable JSON', err.message);
  }
  check(meta?.dimensions === 8, 'sidecar.dimensions = 8');
  check(meta?.metric === 'cosine', 'sidecar.metric = cosine');

  // 2. insert — should NOT crash with `Unexpected token 'r'` from JSON.parse(redb).
  const vectorsPath = join(tmp, 'vecs.json');
  const vectors = [
    { id: 'a', vector: [1, 0, 0, 0, 0, 0, 0, 0] },
    { id: 'b', vector: [0, 1, 0, 0, 0, 0, 0, 0] },
    { id: 'c', vector: [0, 0, 1, 0, 0, 0, 0, 0] },
  ];
  writeFileSync(vectorsPath, JSON.stringify(vectors));
  res = runCli(['insert', dbPath, vectorsPath]);
  check(res.status === 0, '`ruvector insert` exits 0', res.stderr || res.stdout);
  check(
    !res.stderr.includes('Unexpected token') && !res.stdout.includes('Unexpected token'),
    'insert does not crash JSON.parsing the redb binary',
    res.stderr || res.stdout,
  );

  // 3. search — should NOT crash, should return at least one hit.
  res = runCli([
    'search',
    dbPath,
    '-v',
    JSON.stringify([1, 0, 0, 0, 0, 0, 0, 0]),
    '-k',
    '3',
  ]);
  // The "Found N results" line is matched against both streams combined.
  const searchOut = res.stdout + res.stderr;
  check(res.status === 0, '`ruvector search` exits 0', res.stderr || res.stdout);
  check(
    /Found\s+\d+\s+results?/.test(searchOut),
    'search prints `Found N results` (across stdout/stderr)',
    searchOut,
  );
  check(
    res.stdout.includes('ID: a') || res.stdout.includes('ID: b'),
    'search renders at least one hit row',
    res.stdout,
  );

  // 4. stats — should NOT crash, should report Vector Count.
  res = runCli(['stats', dbPath]);
  check(res.status === 0, '`ruvector stats` exits 0', res.stderr || res.stdout);
  check(res.stdout.includes('Vector Count'), 'stats prints Vector Count', res.stdout);

  // 5. helpful error when sidecar is absent (regression guard for the
  //    "user constructs DB without create" path).
  const orphanDb = join(tmp, 'orphan.db');
  writeFileSync(orphanDb, 'redb-fake-binary'); // pretend a redb file existed
  res = runCli(['stats', orphanDb]);
  check(res.status !== 0, 'stats fails fast on orphan DB without sidecar');
  check(
    (res.stderr + res.stdout).includes('sidecar'),
    'orphan-DB error message mentions sidecar',
    res.stderr || res.stdout,
  );
} finally {
  // Best-effort cleanup: a failure to remove the scratch directory must not
  // change the test verdict.
  try { rmSync(tmp, { recursive: true, force: true }); } catch {}
}

// Exit non-zero if any expectation failed so CI treats the run as a failure.
if (failures > 0) {
  console.error(`\n${failures} check(s) failed`);
  process.exit(1);
}
console.log(`\nruvector fresh-DB CLI smoke OK (issue #417)`);