mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-23 21:25:02 +00:00
fix(ruvector): CLI works on fresh DBs via meta sidecar (#417)
Six CLI commands crashed on every fresh database produced by
`ruvector create`:
$ ruvector create /tmp/x.db -d 384
$ ruvector insert /tmp/x.db /tmp/v.json
SyntaxError: Unexpected token 'r', "redb…" is not valid JSON
Root cause: `bin/cli.js` `insert`, `search`, `stats`, `export`, and
`import` all did `JSON.parse(fs.readFileSync(dbPath, 'utf8'))` to
recover the dimension. But `<dbPath>` is a redb (Rust binary) file
managed by `@ruvector/core` — not a JSON document. The first byte
("r") tripped the parser before any other code ran.
Compounding: the same handlers called methods that don't exist on
`VectorDBWrapper` (`db.load`, `db.save`, `db.stats`) and didn't
`await` the async wrapper methods that do exist (`insert`,
`insertBatch`, `search`, `len`).
Fix:
- Persist construction args (dimensions, metric, schema version)
in `<dbPath>.meta.json` from `create`. `insert`/`search`/`stats`
read the sidecar and pass them straight to the wrapper
constructor — no more JSON-parsing of redb bytes.
- Drop calls to the phantom `db.load`/`db.save`/`db.stats` API.
Persistence is automatic via `storagePath`; counting goes through
`await db.len()`.
- Make every CLI handler `async` and `await` the wrapper calls.
Includes `benchmark`, whose previously-dropped promises meant the
reported insert/search rates were just spinner timing.
- Coerce numeric ids to strings inside `insert` (the native binding
rejects integer ids).
- Surface a clear, actionable error when a DB exists without a
sidecar (e.g. created by an older CLI), instead of an opaque
parse failure.
Verified end-to-end with a new test on Node 22.22.2:
$ node test/cli-fresh-db.test.mjs
ok: `ruvector create` exits 0
ok: redb file exists at dbPath
ok: sidecar metadata file exists
ok: sidecar.dimensions = 8
ok: sidecar.metric = cosine
ok: `ruvector insert` exits 0
ok: insert does not crash JSON.parsing the redb binary
ok: `ruvector search` exits 0
ok: search prints `Found N results` (across stdout/stderr)
ok: search renders at least one hit row
ok: `ruvector stats` exits 0
ok: stats prints Vector Count
ok: stats fails fast on orphan DB without sidecar
ok: orphan-DB error message mentions sidecar
ruvector fresh-DB CLI smoke OK (issue #417)
Out of scope (deliberately): the `export`/`import` handlers also
called the same phantom API. Those need the wrapper to grow an
enumeration method (`db.entries()` or similar) before they can do
honest work — exporting only the file-level metadata would be misleading. Tracked in a
follow-up; the existing handlers are left untouched here.
The ONNX-bundle half of #417 ships in a separate PR (#354).
Closes #417
Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
parent
e383476014
commit
3b00a565eb
2 changed files with 236 additions and 58 deletions
|
|
@ -45,6 +45,64 @@ function requireRuvector() {
|
|||
}
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Database metadata sidecar (#417)
|
||||
// -----------------------------------------------------------------------------
|
||||
// `<dbPath>` is a redb (Rust binary) file managed by @ruvector/core. It is NOT
|
||||
// a JSON document, so the previous implementation that called
|
||||
// `JSON.parse(fs.readFileSync(dbPath))` to recover dimensions crashed
|
||||
// immediately on the redb magic bytes "redb…".
|
||||
//
|
||||
// Instead, every `create` writes `<dbPath>.meta.json` carrying the construction
|
||||
// args (dimensions, metric, schema version). `insert`, `search`, `stats` and
|
||||
// friends read from the sidecar and pass them straight to the wrapper
|
||||
// constructor.
|
||||
// =============================================================================
|
||||
|
||||
const META_SCHEMA_VERSION = 1;
|
||||
|
||||
/**
 * Build the path of the JSON metadata sidecar that accompanies a database file.
 *
 * @param {string} dbPath - Path of the database file.
 * @returns {string} `dbPath` with `.meta.json` appended.
 */
function metaPathFor(dbPath) {
  return `${dbPath}.meta.json`;
}
|
||||
|
||||
/**
 * Write the metadata sidecar for a database.
 *
 * The sidecar records the schema version, the supplied dimensions and metric,
 * the CLI package version and a creation timestamp, serialized as
 * 2-space-indented JSON. The file is written synchronously to
 * `metaPathFor(dbPath)`.
 *
 * @param {string} dbPath - Path of the database file the sidecar belongs to.
 * @param {{dimensions: number, metric: string}} meta - Construction args to persist.
 * @returns {void}
 */
function writeMeta(dbPath, meta) {
  const payload = {
    schemaVersion: META_SCHEMA_VERSION,
    dimensions: meta.dimensions,
    metric: meta.metric,
    cliVersion: packageJson.version,
    createdAt: new Date().toISOString(),
  };
  fs.writeFileSync(metaPathFor(dbPath), JSON.stringify(payload, null, 2));
}
|
||||
|
||||
/**
 * Load and validate the metadata sidecar for a database.
 *
 * @param {string} dbPath - Path of the database file.
 * @returns {object} The parsed sidecar contents; `dimensions` is guaranteed to
 *   be a positive integer, other fields are returned as stored.
 * @throws {Error} If neither the database nor its sidecar exists, if the
 *   database exists without a sidecar, if the sidecar is not valid JSON, if it
 *   is not a JSON object, or if `dimensions` is missing or not a positive
 *   integer.
 */
function readMeta(dbPath) {
  const metaPath = metaPathFor(dbPath);

  if (!fs.existsSync(metaPath)) {
    // Distinguish "nothing here at all" from "DB present but no sidecar" so
    // the user gets an actionable message for each case.
    if (!fs.existsSync(dbPath)) {
      throw new Error(
        `Database not found: ${dbPath}\n` +
        ` Run "ruvector create ${dbPath}" first.`,
      );
    }
    throw new Error(
      `Database metadata sidecar not found: ${metaPath}\n` +
      ` This database was created without a sidecar (e.g. before #417 was fixed).\n` +
      ` Recreate it with "ruvector create ${dbPath} -d <dimensions> -m <metric>".`,
    );
  }

  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
  } catch (e) {
    throw new Error(`Invalid sidecar at ${metaPath}: ${e.message}`, { cause: e });
  }

  // JSON.parse happily returns null, numbers, strings or arrays; reading
  // `.dimensions` off null would otherwise surface as an opaque TypeError.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new Error(`Invalid sidecar at ${metaPath}: expected a JSON object`);
  }

  // A vector dimension has to be a whole, positive number.
  if (!Number.isInteger(parsed.dimensions) || parsed.dimensions <= 0) {
    throw new Error(`Invalid sidecar at ${metaPath}: missing or invalid dimensions`);
  }

  return parsed;
}
|
||||
|
||||
// Lazy load GNN (optional - loaded on first use, not at startup)
|
||||
// Saves ~6ms startup time by deferring require('@ruvector/gnn')
|
||||
let _gnnModule = undefined; // undefined = not yet attempted, null = failed, object = loaded
|
||||
|
|
@ -157,16 +215,25 @@ program
|
|||
const spinner = ora('Creating database...').start();
|
||||
|
||||
try {
|
||||
const dimension = parseInt(options.dimension);
|
||||
const db = new VectorDB({
|
||||
dimensions: dimension,
|
||||
const dimensions = parseInt(options.dimension);
|
||||
// Construct the redb-backed DB; this creates the file at `dbPath`.
|
||||
// Persistence is automatic via `storagePath` — there is no
|
||||
// separate save() call.
|
||||
// eslint-disable-next-line no-new
|
||||
new VectorDB({
|
||||
dimensions,
|
||||
metric: options.metric,
|
||||
storagePath: dbPath,
|
||||
});
|
||||
|
||||
// Persist the construction args so subsequent commands can recover
|
||||
// them without trying to JSON.parse() the redb binary (#417).
|
||||
writeMeta(dbPath, { dimensions, metric: options.metric });
|
||||
|
||||
spinner.succeed(chalk.green(`Database created: ${dbPath}`));
|
||||
console.log(chalk.gray(` Dimension: ${dimension}`));
|
||||
console.log(chalk.gray(` Dimension: ${dimensions}`));
|
||||
console.log(chalk.gray(` Metric: ${options.metric}`));
|
||||
console.log(chalk.gray(` Sidecar: ${metaPathFor(dbPath)}`));
|
||||
console.log(chalk.gray(` Implementation: ${getImplementationType()}`));
|
||||
} catch (error) {
|
||||
spinner.fail(chalk.red('Failed to create database'));
|
||||
|
|
@ -180,43 +247,39 @@ program
|
|||
.command('insert <database> <file>')
|
||||
.description('Insert vectors from JSON file')
|
||||
.option('-b, --batch-size <number>', 'Batch size for insertion', '1000')
|
||||
.action((dbPath, file, options) => {
|
||||
.action(async (dbPath, file, options) => {
|
||||
requireRuvector();
|
||||
const spinner = ora('Loading database...').start();
|
||||
|
||||
try {
|
||||
// Read database metadata to get dimension
|
||||
let dimension = 384; // default
|
||||
if (fs.existsSync(dbPath)) {
|
||||
const dbData = fs.readFileSync(dbPath, 'utf8');
|
||||
const parsed = JSON.parse(dbData);
|
||||
dimension = parsed.dimension || 384;
|
||||
}
|
||||
|
||||
const db = new VectorDB({ dimension });
|
||||
|
||||
if (fs.existsSync(dbPath)) {
|
||||
db.load(dbPath);
|
||||
}
|
||||
const meta = readMeta(dbPath);
|
||||
const db = new VectorDB({
|
||||
dimensions: meta.dimensions,
|
||||
metric: meta.metric,
|
||||
storagePath: dbPath,
|
||||
});
|
||||
|
||||
spinner.text = 'Reading vectors...';
|
||||
const data = JSON.parse(fs.readFileSync(file, 'utf8'));
|
||||
const vectors = Array.isArray(data) ? data : [data];
|
||||
|
||||
// Coerce integer ids to strings — the native binding requires string ids.
|
||||
for (const v of vectors) {
|
||||
if (typeof v.id === 'number') v.id = String(v.id);
|
||||
}
|
||||
|
||||
spinner.text = `Inserting ${vectors.length} vectors...`;
|
||||
const batchSize = parseInt(options.batchSize);
|
||||
|
||||
for (let i = 0; i < vectors.length; i += batchSize) {
|
||||
const batch = vectors.slice(i, i + batchSize);
|
||||
db.insertBatch(batch);
|
||||
await db.insertBatch(batch);
|
||||
spinner.text = `Inserted ${Math.min(i + batchSize, vectors.length)}/${vectors.length} vectors...`;
|
||||
}
|
||||
|
||||
db.save(dbPath);
|
||||
const total = await db.len();
|
||||
spinner.succeed(chalk.green(`Inserted ${vectors.length} vectors`));
|
||||
|
||||
const stats = db.stats();
|
||||
console.log(chalk.gray(` Total vectors: ${stats.count}`));
|
||||
console.log(chalk.gray(` Total vectors: ${total}`));
|
||||
} catch (error) {
|
||||
spinner.fail(chalk.red('Failed to insert vectors'));
|
||||
console.error(chalk.red(error.message));
|
||||
|
|
@ -232,18 +295,17 @@ program
|
|||
.option('-k, --top-k <number>', 'Number of results', '10')
|
||||
.option('-t, --threshold <number>', 'Similarity threshold', '0.0')
|
||||
.option('-f, --filter <json>', 'Metadata filter as JSON')
|
||||
.action((dbPath, options) => {
|
||||
.action(async (dbPath, options) => {
|
||||
requireRuvector();
|
||||
const spinner = ora('Loading database...').start();
|
||||
|
||||
try {
|
||||
// Read database metadata
|
||||
const dbData = fs.readFileSync(dbPath, 'utf8');
|
||||
const parsed = JSON.parse(dbData);
|
||||
const dimension = parsed.dimension || 384;
|
||||
|
||||
const db = new VectorDB({ dimension });
|
||||
db.load(dbPath);
|
||||
const meta = readMeta(dbPath);
|
||||
const db = new VectorDB({
|
||||
dimensions: meta.dimensions,
|
||||
metric: meta.metric,
|
||||
storagePath: dbPath,
|
||||
});
|
||||
|
||||
spinner.text = 'Searching...';
|
||||
|
||||
|
|
@ -251,18 +313,21 @@ program
|
|||
const query = {
|
||||
vector,
|
||||
k: parseInt(options.topK),
|
||||
threshold: parseFloat(options.threshold)
|
||||
};
|
||||
|
||||
if (options.filter) {
|
||||
query.filter = JSON.parse(options.filter);
|
||||
}
|
||||
|
||||
const results = db.search(query);
|
||||
spinner.succeed(chalk.green(`Found ${results.length} results`));
|
||||
const results = await db.search(query);
|
||||
const threshold = parseFloat(options.threshold);
|
||||
const filtered = threshold > 0
|
||||
? results.filter((r) => r.score >= threshold)
|
||||
: results;
|
||||
spinner.succeed(chalk.green(`Found ${filtered.length} results`));
|
||||
|
||||
console.log(chalk.cyan('\nSearch Results:'));
|
||||
results.forEach((result, i) => {
|
||||
filtered.forEach((result, i) => {
|
||||
console.log(chalk.white(`\n${i + 1}. ID: ${result.id}`));
|
||||
console.log(chalk.yellow(` Score: ${result.score.toFixed(4)}`));
|
||||
if (result.metadata) {
|
||||
|
|
@ -280,35 +345,32 @@ program
|
|||
program
|
||||
.command('stats <database>')
|
||||
.description('Show database statistics')
|
||||
.action((dbPath) => {
|
||||
.action(async (dbPath) => {
|
||||
requireRuvector();
|
||||
const spinner = ora('Loading database...').start();
|
||||
|
||||
try {
|
||||
const dbData = fs.readFileSync(dbPath, 'utf8');
|
||||
const parsed = JSON.parse(dbData);
|
||||
const dimension = parsed.dimension || 384;
|
||||
const meta = readMeta(dbPath);
|
||||
const db = new VectorDB({
|
||||
dimensions: meta.dimensions,
|
||||
metric: meta.metric,
|
||||
storagePath: dbPath,
|
||||
});
|
||||
|
||||
const db = new VectorDB({ dimension });
|
||||
db.load(dbPath);
|
||||
|
||||
const stats = db.stats();
|
||||
const count = await db.len();
|
||||
spinner.succeed(chalk.green('Database statistics'));
|
||||
|
||||
console.log(chalk.cyan('\nDatabase Stats:'));
|
||||
console.log(chalk.white(` Vector Count: ${chalk.yellow(stats.count)}`));
|
||||
console.log(chalk.white(` Dimension: ${chalk.yellow(stats.dimension)}`));
|
||||
console.log(chalk.white(` Metric: ${chalk.yellow(stats.metric)}`));
|
||||
console.log(chalk.white(` Vector Count: ${chalk.yellow(count)}`));
|
||||
console.log(chalk.white(` Dimension: ${chalk.yellow(meta.dimensions)}`));
|
||||
console.log(chalk.white(` Metric: ${chalk.yellow(meta.metric)}`));
|
||||
console.log(chalk.white(` Implementation: ${chalk.yellow(getImplementationType())}`));
|
||||
|
||||
if (stats.memoryUsage) {
|
||||
const mb = (stats.memoryUsage / (1024 * 1024)).toFixed(2);
|
||||
console.log(chalk.white(` Memory Usage: ${chalk.yellow(mb + ' MB')}`));
|
||||
if (fs.existsSync(dbPath)) {
|
||||
const fileStats = fs.statSync(dbPath);
|
||||
const fileMb = (fileStats.size / (1024 * 1024)).toFixed(2);
|
||||
console.log(chalk.white(` File Size: ${chalk.yellow(fileMb + ' MB')}`));
|
||||
}
|
||||
|
||||
const fileStats = fs.statSync(dbPath);
|
||||
const fileMb = (fileStats.size / (1024 * 1024)).toFixed(2);
|
||||
console.log(chalk.white(` File Size: ${chalk.yellow(fileMb + ' MB')}`));
|
||||
} catch (error) {
|
||||
spinner.fail(chalk.red('Failed to load database'));
|
||||
console.error(chalk.red(error.message));
|
||||
|
|
@ -323,7 +385,7 @@ program
|
|||
.option('-d, --dimension <number>', 'Vector dimension', '384')
|
||||
.option('-n, --num-vectors <number>', 'Number of vectors', '10000')
|
||||
.option('-q, --num-queries <number>', 'Number of queries', '1000')
|
||||
.action((options) => {
|
||||
.action(async (options) => {
|
||||
requireRuvector();
|
||||
console.log(chalk.cyan('\nruvector Performance Benchmark'));
|
||||
console.log(chalk.gray(`Implementation: ${getImplementationType()}\n`));
|
||||
|
|
@ -338,7 +400,7 @@ program
|
|||
const db = new VectorDB({ dimensions: dimension, metric: 'cosine' });
|
||||
spinner.succeed();
|
||||
|
||||
// Insert benchmark
|
||||
// Insert benchmark — must await, the wrapper resolves on actual native completion.
|
||||
spinner = ora(`Inserting ${numVectors} vectors...`).start();
|
||||
const insertStart = Date.now();
|
||||
|
||||
|
|
@ -351,14 +413,16 @@ program
|
|||
});
|
||||
}
|
||||
|
||||
db.insertBatch(vectors);
|
||||
await db.insertBatch(vectors);
|
||||
const insertTime = Date.now() - insertStart;
|
||||
const insertRate = (numVectors / (insertTime / 1000)).toFixed(0);
|
||||
|
||||
spinner.succeed(chalk.green(`Inserted ${numVectors} vectors in ${insertTime}ms`));
|
||||
console.log(chalk.gray(` Rate: ${chalk.yellow(insertRate)} vectors/sec`));
|
||||
|
||||
// Search benchmark
|
||||
// Search benchmark — must await each query (#417: previously the
|
||||
// promises were dropped on the floor and the reported rate was just
|
||||
// spinner timing).
|
||||
spinner = ora(`Running ${numQueries} searches...`).start();
|
||||
const searchStart = Date.now();
|
||||
|
||||
|
|
@ -367,7 +431,7 @@ program
|
|||
vector: Array.from({ length: dimension }, () => Math.random()),
|
||||
k: 10
|
||||
};
|
||||
db.search(query);
|
||||
await db.search(query);
|
||||
}
|
||||
|
||||
const searchTime = Date.now() - searchStart;
|
||||
|
|
|
|||
114
npm/packages/ruvector/test/cli-fresh-db.test.mjs
Normal file
114
npm/packages/ruvector/test/cli-fresh-db.test.mjs
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
// End-to-end CLI smoke for issue #417 — verifies that `ruvector create`,
|
||||
// `insert`, `search`, and `stats` work on a fresh database (the old
|
||||
// implementation crashed on every command after `create` because it
|
||||
// JSON.parse()d the redb binary file).
|
||||
//
|
||||
// Run with:
|
||||
//
|
||||
// node test/cli-fresh-db.test.mjs
|
||||
|
||||
import { spawnSync } from 'node:child_process';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { dirname, join } from 'node:path';
|
||||
import { mkdtempSync, rmSync, writeFileSync, existsSync, readFileSync } from 'node:fs';
|
||||
import { tmpdir } from 'node:os';
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
const repoRoot = join(__dirname, '..');
|
||||
const cli = join(repoRoot, 'bin/cli.js');
|
||||
const tmp = mkdtempSync(join(tmpdir(), 'ruvector-417-'));
|
||||
const dbPath = join(tmp, 'test.db');
|
||||
|
||||
let failures = 0;
|
||||
/**
 * Record the outcome of one smoke-test expectation.
 *
 * On success prints an `ok:` line to stdout. On failure prints a `FAIL:` line
 * (followed by `extra`, when truthy) to stderr and increments the module-level
 * `failures` counter; it does not throw, so later checks still run.
 *
 * @param {*} cond - Truthy when the expectation held.
 * @param {string} msg - Description of the expectation.
 * @param {*} [extra] - Optional diagnostic output shown only on failure.
 */
function check(cond, msg, extra) {
  if (cond) {
    console.log(' ok:', msg);
    return;
  }
  console.error('FAIL:', msg);
  if (extra) console.error(' ', extra);
  failures++;
}
|
||||
|
||||
/**
 * Run the ruvector CLI synchronously in a child Node process.
 *
 * @param {string[]} args - Arguments passed to `bin/cli.js`.
 * @param {Object<string, string>} [env={}] - Extra environment variables,
 *   layered over the current process environment.
 * @returns {import('node:child_process').SpawnSyncReturns<string>} The
 *   `spawnSync` result, with `stdout`/`stderr` decoded as UTF-8.
 */
function runCli(args, env = {}) {
  return spawnSync(process.execPath, [cli, ...args], {
    cwd: repoRoot,
    encoding: 'utf8',
    env: { ...process.env, ...env },
    // Bound each invocation so a hung CLI child cannot block the smoke test
    // forever; a timed-out run does not report status 0, so the callers'
    // exit-code checks flag it as a failure.
    timeout: 60000,
  });
}
|
||||
|
||||
// Smoke scenario, run top to bottom against a throwaway database under `tmp`:
// create -> insert -> search -> stats -> stats on a DB that has no sidecar.
// Every expectation goes through check(), so one failure does not hide the
// ones after it; the temp directory is removed no matter what happens.

// spawnSync reports null streams when the child could not be spawned at all;
// normalise to '' so the string checks below fail cleanly instead of throwing.
const textOf = (stream) => stream ?? '';

try {
  // 1. create — should succeed AND drop a sidecar.
  let res = runCli(['create', dbPath, '-d', '8', '-m', 'cosine']);
  check(res.status === 0, '`ruvector create` exits 0', res.stderr || res.stdout);
  check(existsSync(dbPath), 'redb file exists at dbPath');
  const sidecar = `${dbPath}.meta.json`;
  check(existsSync(sidecar), 'sidecar metadata file exists');

  // Read the sidecar defensively: if `create` failed there is nothing to
  // parse, and an uncaught ENOENT/SyntaxError here would abort the run before
  // the failure summary is printed.
  let meta = {};
  try {
    meta = JSON.parse(readFileSync(sidecar, 'utf8')) ?? {};
  } catch (err) {
    check(false, 'sidecar is readable JSON', err.message);
  }
  check(meta.dimensions === 8, 'sidecar.dimensions = 8');
  check(meta.metric === 'cosine', 'sidecar.metric = cosine');

  // 2. insert — should NOT crash with `Unexpected token 'r'` from JSON.parse(redb).
  const vectorsPath = join(tmp, 'vecs.json');
  const vectors = [
    { id: 'a', vector: [1, 0, 0, 0, 0, 0, 0, 0] },
    { id: 'b', vector: [0, 1, 0, 0, 0, 0, 0, 0] },
    { id: 'c', vector: [0, 0, 1, 0, 0, 0, 0, 0] },
  ];
  writeFileSync(vectorsPath, JSON.stringify(vectors));
  res = runCli(['insert', dbPath, vectorsPath]);
  check(res.status === 0, '`ruvector insert` exits 0', res.stderr || res.stdout);
  check(
    !textOf(res.stderr).includes('Unexpected token') &&
      !textOf(res.stdout).includes('Unexpected token'),
    'insert does not crash JSON.parsing the redb binary',
    res.stderr || res.stdout,
  );

  // 3. search — should NOT crash, should return at least one hit.
  res = runCli([
    'search',
    dbPath,
    '-v',
    JSON.stringify([1, 0, 0, 0, 0, 0, 0, 0]),
    '-k',
    '3',
  ]);
  // The "Found N results" line is matched against both streams combined.
  const searchOut = textOf(res.stdout) + textOf(res.stderr);
  check(res.status === 0, '`ruvector search` exits 0', res.stderr || res.stdout);
  check(
    /Found\s+\d+\s+results?/.test(searchOut),
    'search prints `Found N results` (across stdout/stderr)',
    searchOut,
  );
  check(
    textOf(res.stdout).includes('ID: a') || textOf(res.stdout).includes('ID: b'),
    'search renders at least one hit row',
    res.stdout,
  );

  // 4. stats — should NOT crash, should report Vector Count.
  res = runCli(['stats', dbPath]);
  check(res.status === 0, '`ruvector stats` exits 0', res.stderr || res.stdout);
  check(textOf(res.stdout).includes('Vector Count'), 'stats prints Vector Count', res.stdout);

  // 5. helpful error when sidecar is absent (regression guard for the
  //    "user constructs DB without create" path).
  const orphanDb = join(tmp, 'orphan.db');
  writeFileSync(orphanDb, 'redb-fake-binary'); // pretend a redb file existed
  res = runCli(['stats', orphanDb]);
  check(res.status !== 0, 'stats fails fast on orphan DB without sidecar');
  check(
    (textOf(res.stderr) + textOf(res.stdout)).includes('sidecar'),
    'orphan-DB error message mentions sidecar',
    res.stderr || res.stdout,
  );
} finally {
  // Best-effort cleanup: a failure to remove the temp dir must not mask the
  // test outcome.
  try { rmSync(tmp, { recursive: true, force: true }); } catch {}
}

if (failures > 0) {
  console.error(`\n${failures} check(s) failed`);
  process.exit(1);
}
console.log(`\nruvector fresh-DB CLI smoke OK (issue #417)`);
|
||||
Loading…
Add table
Add a link
Reference in a new issue