mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-23 21:25:02 +00:00
feat(sse): decouple SSE to mcp.pi.ruv.io proxy + Claude Code source research
SSE Proxy Decoupling (ADR-130): - Fix ruvbrain-sse proxy: proper MCP handshake, session creation, drain polling - Fix internal queue endpoints: session_create keeps receiver, drain returns buffered messages - Add response_queues to AppState for SSE proxy communication - Skip sparsifier for >5M edge graphs (was crashing on 16M edges) - Add SSE_DISABLED/MAX_SSE env vars for configurable connection limits - Route SSE to dedicated mcp.pi.ruv.io subdomain (Cloudflare CNAME) - Serve SSE at root / path on proxy (no /sse needed) - Update all references from pi.ruv.io/sse to mcp.pi.ruv.io - Fix Dockerfile consciousness crate build (feature/version mismatches) Claude Code CLI Source Research (ADR-133): - 19 research documents analyzing Claude Code internals (3000+ lines) - Decompiler script + RVF corpus builder for all major versions - Binary RVF containers for v0.2, v1.0, v2.0, v2.1 (300-2068 vectors each) - Call graphs, class hierarchies, state machines from minified source Integration Strategy (ADR-134): - 6-tier integration plan: WASM MCP, agents, hooks, cache, SDK, plugin - Integration guide with architecture diagrams and performance targets Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
parent
3569b697c1
commit
930fca916f
103 changed files with 50257 additions and 78 deletions
267
scripts/claude-code-decompile.sh
Executable file
267
scripts/claude-code-decompile.sh
Executable file
|
|
@ -0,0 +1,267 @@
|
|||
#!/usr/bin/env bash
|
||||
# claude-code-decompile.sh - Extract and analyze Claude Code CLI source
|
||||
#
|
||||
# Extracts the bundled JavaScript from the Claude Code binary or npm package,
|
||||
# applies basic beautification, and splits into logical modules.
|
||||
#
|
||||
# Usage: ./scripts/claude-code-decompile.sh [output-dir]
|
||||
#
|
||||
# Output directory defaults to ./claude-code-extracted/
|
||||
|
||||
set -euo pipefail

# First positional argument overrides the output directory.
OUTPUT_DIR="${1:-./claude-code-extracted}"
# Exactly one of these is filled in by find_source().
BINARY=""
CLI_JS=""

# Color output (ANSI sequences; expanded by printf's %b in the helpers below)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Logging helpers. The colour codes go through %b (escape-expanding) while the
# message itself goes through %s, so backslashes in paths or extracted text are
# printed literally instead of being interpreted as escape sequences.
log()  { printf '%b[+]%b %s\n' "$GREEN" "$NC" "$*"; }
warn() { printf '%b[!]%b %s\n' "$YELLOW" "$NC" "$*"; }
err()  { printf '%b[-]%b %s\n' "$RED" "$NC" "$*" >&2; }
|
||||
|
||||
# Find the Claude Code source.
#
# On success sets exactly one global and returns 0:
#   CLI_JS - path to the npm package's cli.js (preferred - cleaner JS)
#   BINARY - path to an installed `claude` executable
# Returns 1 after printing install hints when neither can be located.
find_source() {
  local npm_root candidate entry newest

  # `npm root -g` fails when npm is not installed; treat that as "no root"
  # rather than letting errexit terminate the script.
  npm_root="$(npm root -g 2>/dev/null || true)"

  # Method 1: NPM package (preferred - cleaner JS)
  local npm_paths=(
    "${npm_root}/claude-flow/node_modules/@anthropic-ai/claude-code/cli.js"
    "${npm_root}/@anthropic-ai/claude-code/cli.js"
    "./node_modules/@anthropic-ai/claude-code/cli.js"
  )
  for candidate in "${npm_paths[@]}"; do
    if [[ -f "$candidate" ]]; then
      CLI_JS="$candidate"
      log "Found NPM package: $CLI_JS"
      return 0
    fi
  done

  # Method 2: Bun SEA binary
  local bin_paths=(
    "$HOME/.local/bin/claude"
    "$HOME/.local/share/claude/versions/"
    "/usr/local/bin/claude"
  )
  for candidate in "${bin_paths[@]}"; do
    if [[ -f "$candidate" ]]; then
      BINARY="$(readlink -f "$candidate" 2>/dev/null || echo "$candidate")"
      log "Found binary: $BINARY"
      return 0
    elif [[ -d "$candidate" ]]; then
      # Pick the most recently modified regular file in the directory.
      # A glob loop (instead of `ls -t | head -1`) cannot fail under
      # errexit/pipefail on an empty directory and never yields a
      # sub-directory listing line as the "binary".
      newest=""
      for entry in "$candidate"*; do
        [[ -f "$entry" ]] || continue
        if [[ -z "$newest" || "$entry" -nt "$newest" ]]; then
          newest="$entry"
        fi
      done
      if [[ -n "$newest" ]]; then
        BINARY="$newest"
        log "Found binary: $BINARY"
        return 0
      fi
    fi
  done

  err "Could not find Claude Code binary or npm package"
  echo "Install via: npm install -g @anthropic-ai/claude-code"
  echo "Or ensure claude is installed: claude --version"
  return 1
}
|
||||
|
||||
# Extract JS from Bun SEA binary using strings.
#
# Arguments:
#   $1 - path to the binary
#   $2 - existing output directory
# Writes raw-strings.txt (every printable string) and js-fragments.txt (the
# subset that looks like JavaScript) into the output directory.
# Returns 1 if the `strings` utility is not available.
extract_from_binary() {
  local binary="$1"
  local output="$2"
  local raw_file="${output}/raw-strings.txt"
  local js_file="${output}/js-fragments.txt"
  local total_lines js_lines

  # Fail with an actionable message instead of a bare "command not found".
  if ! command -v strings >/dev/null 2>&1; then
    err "'strings' utility not found (usually provided by binutils)"
    return 1
  fi

  log "Extracting strings from binary ($(du -h "$binary" | cut -f1))..."
  strings "$binary" > "$raw_file"

  total_lines=$(wc -l < "$raw_file")
  log "Extracted $total_lines string fragments"

  # Extract JS-like patterns. POSIX [[:space:]] is used instead of the
  # GNU-only \s so the filter behaves the same with any ERE-capable grep.
  # grep exits 1 when nothing matches; that is not an error here.
  log "Filtering JavaScript patterns..."
  grep -E '(function[[:space:]]|class[[:space:]]|=>[[:space:]]*\{|export[[:space:]]|import[[:space:]]|require\(|async[[:space:]]|await[[:space:]]|const[[:space:]]|let[[:space:]]|var[[:space:]])' \
    "$raw_file" > "$js_file" 2>/dev/null || true

  js_lines=$(wc -l < "$js_file")
  log "Found $js_lines JS-like fragments"
}
|
||||
|
||||
# Count occurrences (not matching lines) of PCRE pattern $1 in file $2.
# grep's "no match" exit status is swallowed so the count is simply 0.
_count_matches() {
  { grep -oP -- "$1" "$2" || true; } | wc -l
}

# Process the cli.js bundle.
#
# Arguments:
#   $1 - path to the bundled cli.js
#   $2 - existing output directory
# Produces cli.js.original (verbatim copy), cli.beautified.js (newline after
# every ';', '{' and '}') and metrics.txt (size, counts, imports, classes).
process_bundle() {
  local source="$1"
  local output="$2"
  local beautified="${output}/cli.beautified.js"
  local beautified_lines

  log "Processing bundle: $(du -h "$source" | cut -f1)"

  # Copy original
  cp "$source" "${output}/cli.js.original"

  # Basic beautification: add newlines at statement boundaries.
  # Purely textual - characters inside string literals are split as well.
  log "Beautifying (adding newlines at statement boundaries)..."
  sed -e 's/;/;\n/g' -e 's/{/{\n/g' -e 's/}/}\n/g' "$source" > "$beautified"

  beautified_lines=$(wc -l < "$beautified")
  log "Beautified: $beautified_lines lines"

  # Extract metrics
  log "Computing code metrics..."
  {
    echo "=== Claude Code Source Metrics ==="
    echo "Date: $(date -Iseconds)"
    echo "Source: $source"
    echo "Original size: $(wc -c < "$source") bytes"
    echo "Original lines: $(wc -l < "$source")"
    echo "Beautified lines: $beautified_lines"
    echo ""
    echo "--- Counts ---"
    echo "Functions: $(_count_matches 'function\s*\w*\s*\(' "$source")"
    echo "Async functions: $(_count_matches 'async\s+function' "$source")"
    echo "Arrow functions: $(_count_matches '=>' "$source")"
    echo "Classes: $(_count_matches 'class \w+' "$source")"
    echo "Extends: $(_count_matches 'extends \w+' "$source")"
    # Occurrence counts: a minified bundle is one or a few huge lines, so a
    # per-line count (grep -c) would report almost nothing.
    echo "For-await loops: $(_count_matches 'for await' "$source")"
    echo "Yield statements: $(_count_matches 'yield' "$source")"
    echo ""
    echo "--- Node.js Imports ---"
    # `|| true`: with pipefail + errexit an empty result would otherwise
    # terminate the whole script.
    grep -oP 'from"[^"]*"' "$source" | sort -u | grep -P 'from"(node:|assert|child_process|crypto|events|fs|http|https|module|net|os|path|process|stream|tty|url|util|zlib)' || true
    echo ""
    echo "--- Class Definitions ---"
    grep -oP 'class \w+( extends \w+)?' "$source" | sort -u || true
  } > "${output}/metrics.txt"

  log "Metrics saved to ${output}/metrics.txt"
}
|
||||
|
||||
# Split into logical modules based on patterns.
#
# Arguments:
#   $1 - file to scan (bundle or extracted fragments)
#   $2 - output directory; results are written to "$2/modules/*.txt"
# Each context module holds every keyword hit with up to 200 characters of
# surrounding text; the remaining modules hold de-duplicated exact matches.
split_modules() {
  local source="$1"
  local output="$2"
  local modules_dir="${output}/modules"
  local window='.{0,200}'
  local spec module_name keywords listing fragment_count

  mkdir -p "$modules_dir"

  log "Splitting into logical modules..."

  # "<module>:<alternation>" - tools, permissions, MCP, streaming,
  # context/compaction and agent loop, in that order.
  local -a context_specs=(
    'tools:(BashTool|FileReadTool|FileEditTool|FileWriteTool|AgentOutputTool|WebFetch|WebSearch|TodoWrite|NotebookEdit|GlobTool|GrepTool)'
    'permissions:(permission|Permission|canUseTool|alwaysAllowRules|denyWrite|sandbox|Sandbox)'
    'mcp:(mcp__|McpClient|McpServer|McpError|callTool|listTools|initialize)'
    'streaming:(content_block_delta|message_start|message_stop|message_delta|content_block_start|content_block_stop|stream_event|text_delta|input_json_delta)'
    'compaction:(compact|compaction|tengu_compact|microcompact|auto_compact|compact_boundary|preCompactTokenCount|postCompactTokenCount)'
    'agent-loop:(agentLoop|mainLoop|s\$\(|querySource|toolUseContext|systemPrompt)'
  )
  for spec in "${context_specs[@]}"; do
    module_name="${spec%%:*}"
    keywords="${spec#*:}"
    grep -oP "${window}${keywords}${window}" \
      "$source" > "${modules_dir}/${module_name}.txt" 2>/dev/null || true
  done

  # Telemetry event names
  grep -oP '"tengu_[^"]*"' "$source" | sort -u > "${modules_dir}/telemetry-events.txt" 2>/dev/null || true

  # String constants (tool names, commands, etc.)
  grep -oP 'name:"[a-z][-a-z]*",description:"[^"]*"' "$source" | sort -u > "${modules_dir}/commands.txt" 2>/dev/null || true

  # Class hierarchy
  grep -oP 'class \w+ extends \w+' "$source" | sort -u > "${modules_dir}/class-hierarchy.txt" 2>/dev/null || true

  # Report how many lines each module file received
  for listing in "${modules_dir}"/*.txt; do
    module_name=$(basename "$listing" .txt)
    fragment_count=$(wc -l < "$listing")
    log " Module '$module_name': $fragment_count fragments"
  done
}
|
||||
|
||||
# Generate RVF files from extracted modules.
#
# Arguments:
#   $1 - output directory containing "modules/" (and, when the source was the
#        npm bundle, "cli.js.original")
# For every modules/<name>.txt a text file rvf/<name>.rvf is written: a
# YAML-style metadata header followed by the fragments in a fenced code block.
generate_rvf() {
  local modules_dir="$1/modules"
  local rvf_dir="$1/rvf"
  local bundle="$1/cli.js.original"
  local version="unknown"
  local version_tag="" module_file name rvf_file

  mkdir -p "$rvf_dir"

  log "Generating RVF files..."

  # Version detection. No `| head -1` pipeline here: under pipefail a SIGPIPE
  # in the first grep made the old `… || echo unknown` fallback fire *after*
  # the version had already been printed, yielding "X.Y.Z\nunknown".
  if [[ -f "$bundle" ]]; then
    version_tag="$(grep -m1 -oP 'VERSION:"[^"]*"' "$bundle" 2>/dev/null || true)"
    version_tag="${version_tag%%$'\n'*}"   # first match only
    if [[ "$version_tag" =~ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
      version="${BASH_REMATCH[1]}"
    fi
  fi

  for module_file in "${modules_dir}"/*.txt; do
    # Unmatched glob stays literal; skip it instead of failing on the read.
    [[ -f "$module_file" ]] || continue
    name=$(basename "$module_file" .txt)
    rvf_file="${rvf_dir}/${name}.rvf"
    {
      echo "---"
      echo "type: source-extraction"
      echo "module: ${name}"
      echo "binary: claude-code"
      echo "version: ${version}"
      echo "extraction-method: strings+pattern-match"
      echo "confidence: medium"
      echo "fragments: $(wc -l < "$module_file")"
      echo "---"
      echo ""
      echo "# ${name} - Extracted Fragments"
      echo ""
      echo '```javascript'
      cat "$module_file"
      echo '```'
    } > "$rvf_file"
    log " Created ${rvf_file}"
  done
}
|
||||
|
||||
# Main: locate the Claude Code source, run the matching extraction pipeline
# and print a summary of what was written to $OUTPUT_DIR.
main() {
  log "Claude Code Decompiler"
  log "======================"

  mkdir -p "$OUTPUT_DIR"

  # Sets CLI_JS or BINARY; a non-zero return ends the script via errexit.
  find_source

  if [[ -n "$CLI_JS" ]]; then
    process_bundle "$CLI_JS" "$OUTPUT_DIR"
    split_modules "$CLI_JS" "$OUTPUT_DIR"
    generate_rvf "$OUTPUT_DIR"
  elif [[ -n "$BINARY" ]]; then
    extract_from_binary "$BINARY" "$OUTPUT_DIR"
    # Only continue if we actually recovered JS. js-fragments.txt is always
    # created by the extraction step, so test for non-empty (-s), not mere
    # existence (-f).
    if [[ -s "${OUTPUT_DIR}/js-fragments.txt" ]]; then
      split_modules "${OUTPUT_DIR}/js-fragments.txt" "$OUTPUT_DIR"
      generate_rvf "$OUTPUT_DIR"
    else
      warn "No JavaScript fragments recovered from binary; skipping module split"
    fi
  fi

  log ""
  log "Extraction complete!"
  log "Output directory: $OUTPUT_DIR"
  log ""
  log "Key files:"
  log " metrics.txt - Code metrics and counts"
  log " cli.beautified.js - Beautified bundle (if from NPM)"
  log " modules/ - Split by logical module"
  log " rvf/ - RVF files with metadata headers"

  # Summary (metrics.txt only exists for the npm-bundle path)
  if [[ -f "${OUTPUT_DIR}/metrics.txt" ]]; then
    echo ""
    head -15 "${OUTPUT_DIR}/metrics.txt"
  fi
}
|
||||
|
||||
main "$@"
|
||||
455
scripts/claude-code-rvf-corpus.sh
Executable file
455
scripts/claude-code-rvf-corpus.sh
Executable file
|
|
@ -0,0 +1,455 @@
|
|||
#!/usr/bin/env bash
|
||||
# claude-code-rvf-corpus.sh - Build binary RVF containers for every major
|
||||
# Claude Code CLI release.
|
||||
#
|
||||
# Downloads the latest patch of each major.minor series from npm, extracts
|
||||
# the CLI bundle, splits into modules, and creates a binary RVF container
|
||||
# with vector embeddings and witness chains.
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/claude-code-rvf-corpus.sh [--dry-run] [--series 0.2,1.0,2.0,2.1]
|
||||
#
|
||||
# Output: docs/research/claude-code-rvsource/versions/vX.Y.x/
|
||||
# - claude-code-vX.Y.rvf Binary RVF container
|
||||
# - claude-code-vX.Y.rvf.manifest.json Container manifest
|
||||
# - source/ Extracted JS modules
|
||||
# - README.md Version metadata
|
||||
|
||||
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
OUTPUT_BASE="${ROOT_DIR}/docs/research/claude-code-rvsource/versions"
# Private scratch directory. mktemp creates it atomically with an
# unpredictable name, unlike a fixed /tmp/<name>-<pid> path.
TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/cc-rvf-corpus-XXXXXX")"
DRY_RUN=false
FILTER_SERIES=""

# Colors (ANSI sequences; expanded by printf %b / echo -e)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# Logging helpers. Colour codes go through %b, the message through %s, so
# backslashes inside a message are printed literally.
log()  { printf '%b[+]%b %s\n' "$GREEN" "$NC" "$*"; }
info() { printf '%b[*]%b %s\n' "$CYAN" "$NC" "$*"; }
warn() { printf '%b[!]%b %s\n' "$YELLOW" "$NC" "$*"; }
err()  { printf '%b[-]%b %s\n' "$RED" "$NC" "$*" >&2; }

# Remove the scratch directory; registered below to run on every exit path.
cleanup() {
  if [[ -n "${TMP_DIR:-}" && -d "$TMP_DIR" ]]; then
    rm -rf -- "$TMP_DIR"
  fi
}
trap cleanup EXIT
|
||||
|
||||
# Parse arguments
#   --dry-run        list the versions that would be processed, then exit
#   --series A,B,..  restrict processing to the given major.minor series
#   --help, -h       print usage and exit
while [[ $# -gt 0 ]]; do
  case "$1" in
    --dry-run)
      DRY_RUN=true
      shift
      ;;
    --series)
      # Without this check a trailing "--series" dies under `set -u` with a
      # bare "unbound variable" message.
      if [[ -z "${2:-}" ]]; then
        err "--series requires a comma-separated list, e.g. --series 2.0,2.1"
        exit 1
      fi
      FILTER_SERIES="$2"
      shift 2
      ;;
    --help|-h)
      echo "Usage: $0 [--dry-run] [--series 0.2,1.0,2.0,2.1]"
      exit 0
      ;;
    *)
      err "Unknown argument: $1"
      exit 1
      ;;
  esac
done
|
||||
|
||||
# Fetch all versions from npm and group by major.minor.
#
# Prints one line per series to stdout, sorted ascending by major then minor:
#   "<major.minor> <version with the highest patch number in that series>"
# Progress and error messages go to stderr because callers capture stdout.
# Returns non-zero if npm cannot be queried or its output cannot be parsed.
get_version_groups() {
  log "Fetching Claude Code versions from npm..." >&2

  local versions_json
  if ! versions_json=$(npm view @anthropic-ai/claude-code versions --json 2>/dev/null) \
      || [[ -z "$versions_json" ]]; then
    err "Could not query npm for @anthropic-ai/claude-code versions"
    return 1
  fi

  # The JSON is handed to node on stdin and parsed as data; it is never
  # spliced into the program text. The script is single-quoted for the
  # shell, so it must not contain single quotes.
  printf '%s' "$versions_json" | node -e '
    const parsed = JSON.parse(require("fs").readFileSync(0, "utf-8"));
    // npm prints a bare string instead of an array when only one version exists.
    const versions = Array.isArray(parsed) ? parsed : [parsed];
    const groups = new Map();

    for (const v of versions) {
      const parts = String(v).split(".");
      const patch = Number.parseInt(parts[2], 10);
      if (parts.length < 3 || Number.isNaN(patch)) continue; // not X.Y.Z
      const key = parts[0] + "." + parts[1];
      const best = groups.get(key);
      if (!best || patch > best.patch) {
        groups.set(key, { version: v, patch, key });
      }
    }

    // Sort by major, then minor
    const sorted = [...groups.values()].sort((a, b) => {
      const [aMaj, aMin] = a.key.split(".").map(Number);
      const [bMaj, bMin] = b.key.split(".").map(Number);
      return aMaj !== bMaj ? aMaj - bMaj : aMin - bMin;
    });

    for (const g of sorted) {
      console.log(g.key + " " + g.version);
    }
  '
}
|
||||
|
||||
# Download and extract a specific version.
#
# Arguments:
#   $1 - exact package version (e.g. 2.1.5)
#   $2 - destination directory (created if missing)
# On success "$2/cli.js" exists (cli.mjs is renamed to cli.js) and, when the
# package ships one, "$2/package.json". Returns 1 if the tarball could not be
# fetched or contains neither cli.js nor cli.mjs.
download_version() {
  local version="$1"
  local dest_dir="$2"
  # Per-version tarball directory: a tarball left over from another version
  # can never be mistaken for this one.
  local tgz_dir="${TMP_DIR}/tarballs/${version}"
  local tgz="" candidate member size

  mkdir -p "$dest_dir"
  info " Downloading @anthropic-ai/claude-code@${version}..."

  rm -rf "$tgz_dir"
  mkdir -p "$tgz_dir"

  # Success is judged by the presence of the tarball below, not by npm's
  # exit status.
  npm pack "@anthropic-ai/claude-code@${version}" --pack-destination "$tgz_dir" \
    >/dev/null 2>&1 || true

  # Find the tarball (naming varies between npm versions)
  for candidate in "$tgz_dir"/anthropic-ai-claude-code-*.tgz; do
    if [[ -f "$candidate" ]]; then
      tgz="$candidate"
      break
    fi
  done
  if [[ -z "$tgz" ]]; then
    err " Failed to download version ${version}"
    rm -rf "$tgz_dir"
    return 1
  fi

  # Try each member independently; a missing member is expected (a package
  # ships either cli.js or cli.mjs), so extraction errors are ignored.
  for member in cli.js cli.mjs package.json; do
    tar xf "$tgz" -C "$dest_dir" --strip-components=1 "package/${member}" 2>/dev/null || true
  done

  # The tarball is no longer needed on any path from here on.
  rm -rf "$tgz_dir"

  # Rename cli.mjs -> cli.js for consistency
  if [[ -f "${dest_dir}/cli.mjs" ]] && [[ ! -f "${dest_dir}/cli.js" ]]; then
    mv "${dest_dir}/cli.mjs" "${dest_dir}/cli.js"
  fi

  if [[ ! -f "${dest_dir}/cli.js" ]]; then
    warn " No cli.js or cli.mjs found in ${version}"
    return 1
  fi

  size=$(du -sh "${dest_dir}/cli.js" 2>/dev/null | cut -f1)
  info " Extracted cli.js (${size})"
  return 0
}
|
||||
|
||||
# Split a CLI bundle into modules.
#
# Arguments:
#   $1 - path to the CLI bundle
#   $2 - directory that receives the module files
# Delegates to lib/module-splitter.mjs with its stderr discarded; the exit
# status of node is the function's return value.
split_modules() {
  local bundle="$1"
  local target_dir="$2"
  local splitter="${SCRIPT_DIR}/lib/module-splitter.mjs"
  local -a command=(node "$splitter" "$bundle" "$target_dir")

  info " Splitting into modules..."
  "${command[@]}" 2>/dev/null
}
|
||||
|
||||
# Build a binary RVF container.
#
# Arguments:
#   $1 - directory holding the extracted module sources
#   $2 - path of the .rvf file to create
#   $3 - full package version
#   $4 - major.minor series
# Delegates to lib/rvf-builder.mjs with its stderr discarded; the exit status
# of node is the function's return value.
build_rvf() {
  local modules_src="$1"
  local container="$2"
  local pkg_version="$3"
  local pkg_series="$4"
  local builder="${SCRIPT_DIR}/lib/rvf-builder.mjs"

  local -a meta_args=(
    --meta "version=${pkg_version}"
    --meta "series=${pkg_series}"
    --meta "package=@anthropic-ai/claude-code"
    --meta "corpus=claude-code-rvsource"
  )

  info " Building binary RVF container..."
  node "$builder" "$modules_src" "$container" "${meta_args[@]}" 2>/dev/null
}
|
||||
|
||||
# Generate a README for a version directory.
#
# Arguments:
#   $1 - version directory (README.md is written here)
#   $2 - major.minor series
#   $3 - full package version
#   $4 - path of the .rvf container; "<$4>.manifest.json" is read if present
# Values that cannot be read from source/metrics.json or the manifest keep
# their placeholder ("unknown", "?" or "N/A").
generate_readme() {
  local ver_dir="$1"
  local series="$2"
  local version="$3"
  local rvf_file="$4"

  local metrics_file="${ver_dir}/source/metrics.json"
  local manifest_file="${rvf_file}.manifest.json"
  local fields

  # Read metrics. The file path is passed as an argument (process.argv[1])
  # instead of being pasted into the JS source, so quotes or backslashes in
  # the path cannot break the script. One node run yields all four values;
  # if it fails the placeholders stay in place.
  local bundle_size="unknown"
  local classes="?"
  local functions="?"
  local modules_count="?"

  if [[ -f "$metrics_file" ]] && fields=$(node -e '
      const m = JSON.parse(require("fs").readFileSync(process.argv[1], "utf-8"));
      console.log((m.sizeBytes / 1024 / 1024).toFixed(1) + "MB");
      console.log(m.classes);
      console.log(m.functions);
      console.log(Object.keys(m.modules || {}).length);
    ' "$metrics_file" 2>/dev/null); then
    {
      IFS= read -r bundle_size
      IFS= read -r classes
      IFS= read -r functions
      IFS= read -r modules_count
    } <<< "$fields" || true
  fi

  local rvf_size="N/A"
  local rvf_vectors="N/A"
  local rvf_id="N/A"

  if [[ -f "$manifest_file" ]] && fields=$(node -e '
      const m = JSON.parse(require("fs").readFileSync(process.argv[1], "utf-8"));
      console.log((m.fileSizeBytes / 1024).toFixed(1) + "KB");
      console.log(m.totalVectors);
      console.log(m.fileId);
    ' "$manifest_file" 2>/dev/null); then
    {
      IFS= read -r rvf_size
      IFS= read -r rvf_vectors
      IFS= read -r rvf_id
    } <<< "$fields" || true
  fi

  cat > "${ver_dir}/README.md" <<READMEEOF
# Claude Code v${version} (${series} series)

## Binary RVF Container

| Property | Value |
|----------|-------|
| Version | ${version} |
| Series | ${series} |
| Bundle size | ${bundle_size} |
| RVF size | ${rvf_size} |
| Vectors | ${rvf_vectors} |
| RVF File ID | \`${rvf_id}\` |
| Classes | ${classes} |
| Functions | ${functions} |
| Modules | ${modules_count} |
| Extracted | $(date -Iseconds) |

## Files

- \`claude-code-v${series}.rvf\` - Binary RVF container with HNSW index + witness chain
- \`claude-code-v${series}.rvf.manifest.json\` - Container manifest (vector ID map, metadata)
- \`source/\` - Extracted JavaScript module fragments

## RVF Container Details

The \`.rvf\` file is a real binary container created with the \`@ruvector/rvf-node\`
native backend. It contains:

- **128-dimensional fingerprint vectors** for each code fragment
- **HNSW index** (M=16, ef_construction=200) for fast similarity search
- **Cosine distance** metric
- **Witness chain** for provenance verification

To query this container:

\`\`\`typescript
import { RvfDatabase } from '@ruvector/rvf';

const db = await RvfDatabase.openReadonly('./claude-code-v${series}.rvf');
const results = await db.query(queryVector, 10);
await db.close();
\`\`\`
READMEEOF
}
|
||||
|
||||
# Generate the top-level index README.
#
# Arguments:
#   $1   - directory that receives README.md
#   $2.. - zero or more pre-formatted Markdown table rows
# The file is written in one pass: fixed header, the rows, fixed footer.
generate_index() {
  local base_dir="$1"
  shift
  local entry

  {
    cat <<'INDEXHEADER'
# Claude Code RVF Corpus

Binary RVF containers for every major Claude Code CLI release, with
HNSW-indexed vector embeddings and witness chains for provenance.

## Versions

| Series | Version | Bundle | RVF Size | Vectors | File ID |
|--------|---------|--------|----------|---------|---------|
INDEXHEADER

    # Iterating "$@" directly is safe with zero rows even under `set -u`
    # on older bash, where expanding an empty array is an error.
    for entry in "$@"; do
      printf '%s\n' "$entry"
    done

    cat <<'INDEXFOOTER'

## How to Use

```bash
# Build the corpus
./scripts/claude-code-rvf-corpus.sh

# Build only specific series
./scripts/claude-code-rvf-corpus.sh --series 2.0,2.1
```

## Format

Each version directory contains:
- A binary `.rvf` container (128-dim cosine-distance HNSW index)
- A `.manifest.json` sidecar with vector-to-fragment mapping
- Extracted JavaScript modules in `source/`

Generated by `scripts/claude-code-rvf-corpus.sh` using `@ruvector/rvf-node`.
INDEXFOOTER
  } > "${base_dir}/README.md"
}
|
||||
|
||||
# -----------------------------------------------------------------------
# Main
#
# Resolves the latest patch release of every major.minor series, optionally
# narrows the list with --series, then for each series downloads the package,
# splits the bundle, builds the RVF container and writes a per-version README.
# Finally the top-level index is rebuilt from every v*.x directory on disk.
# -----------------------------------------------------------------------

main() {
  echo -e "${BOLD}Claude Code RVF Corpus Builder${NC}"
  echo -e "${BOLD}==============================${NC}"
  echo ""

  mkdir -p "$TMP_DIR" "$OUTPUT_BASE"

  # Get version groups: one "<major.minor> <version>" line per series
  local groups
  groups=$(get_version_groups)

  if [[ -z "$groups" ]]; then
    err "No versions found on npm"
    exit 1
  fi

  local total_groups
  total_groups=$(grep -c '' <<< "$groups")
  log "Found ${total_groups} major.minor series"

  local line series version

  # Apply filter if specified
  if [[ -n "$FILTER_SERIES" ]]; then
    local filtered="" wanted
    local -a filter_array
    IFS=',' read -ra filter_array <<< "$FILTER_SERIES"
    while IFS= read -r line; do
      [[ -z "$line" ]] && continue
      series="${line%% *}"
      for wanted in "${filter_array[@]}"; do
        if [[ "$series" == "$wanted" ]]; then
          filtered+="${line}"$'\n'
          break
        fi
      done
    done <<< "$groups"
    groups="${filtered%$'\n'}"

    # A filter that matches nothing is almost certainly a typo. Stop here
    # rather than reporting a bogus count and processing an empty list.
    if [[ -z "$groups" ]]; then
      err "No series matched --series ${FILTER_SERIES}"
      exit 1
    fi
    total_groups=$(grep -c '' <<< "$groups")
    log "Filtered to ${total_groups} series: ${FILTER_SERIES}"
  fi

  if $DRY_RUN; then
    warn "DRY RUN - would process these versions:"
    while IFS= read -r line; do
      [[ -z "$line" ]] && continue
      series="${line%% *}"
      version="${line#* }"
      echo " v${series}.x -> ${version}"
    done <<< "$groups"
    exit 0
  fi

  local processed=0
  local failed=0

  while IFS= read -r line; do
    [[ -z "$line" ]] && continue

    series="${line%% *}"
    version="${line#* }"

    echo ""
    log "Processing v${series}.x (latest: ${version})"

    local ver_dir="${OUTPUT_BASE}/v${series}.x"
    local source_dir="${ver_dir}/source"
    local rvf_file="${ver_dir}/claude-code-v${series}.rvf"
    local extract_dir="${TMP_DIR}/extract-${version}"

    mkdir -p "$ver_dir" "$source_dir"

    # Step 1: Download
    if ! download_version "$version" "$extract_dir"; then
      warn " Skipping ${version} (download failed)"
      failed=$((failed + 1))
      continue
    fi

    local cli_path="${extract_dir}/cli.js"
    if [[ ! -f "$cli_path" ]]; then
      warn " No CLI bundle found for ${version}"
      failed=$((failed + 1))
      continue
    fi

    # Step 2: Split into modules (a failure is reported but not fatal)
    if ! split_modules "$cli_path" "$source_dir"; then
      warn " Module splitting failed for ${version}"
    fi

    # Step 3: Build binary RVF container
    if build_rvf "$source_dir" "$rvf_file" "$version" "$series"; then
      log " RVF container created: $(basename "$rvf_file")"
    else
      warn " RVF creation failed for ${version}"
      # Create a fallback TODO note
      cat > "${ver_dir}/TODO-rvf.md" <<EOF
# TODO: Create RVF Container

Version: ${version}
Series: v${series}.x
Error: RVF binary creation failed

The source modules have been extracted to \`source/\` but the binary
RVF container could not be created. This typically means the
\`@ruvector/rvf-node\` native backend is not available.

To create the container manually:

\`\`\`bash
node scripts/lib/rvf-builder.mjs source/ claude-code-v${series}.rvf \\
--meta version=${version} --meta series=${series}
\`\`\`
EOF
    fi

    # Step 4: Generate README
    generate_readme "$ver_dir" "$series" "$version" "$rvf_file"

    # Clean up extracted tarball content
    rm -rf "$extract_dir"

    processed=$((processed + 1))
    log " Done (${processed}/${total_groups})"
  done <<< "$groups"

  # Generate index
  echo ""
  log "Generating corpus index..."

  # Rebuild index entries by scanning output dirs, so series built by earlier
  # runs are listed as well.
  local -a final_entries=()
  local d dir_name series_name manifest row
  for d in "${OUTPUT_BASE}"/v*.x; do
    [[ -d "$d" ]] || continue
    dir_name=$(basename "$d")
    series_name="${dir_name#v}"
    series_name="${series_name%.x}"

    manifest="${d}/claude-code-v${series_name}.rvf.manifest.json"
    if [[ -f "$manifest" ]]; then
      # Path and series are passed as arguments, not pasted into the JS.
      # Any failure falls back to a row of question marks.
      row=$(node -e '
        const [manifestPath, seriesName] = process.argv.slice(1);
        const m = JSON.parse(require("fs").readFileSync(manifestPath, "utf-8"));
        const s = m.source || {};
        const met = s.metrics || {};
        const bundle = (met.bundleSizeBytes / 1024 / 1024).toFixed(1) + "MB";
        const rvfSize = (m.fileSizeBytes / 1024).toFixed(1) + "KB";
        console.log("| " + seriesName + " | " + s.version + " | " + bundle + " | " + rvfSize + " | " + m.totalVectors + " | `" + m.fileId.slice(0, 12) + "...` |");
      ' "$manifest" "$series_name" 2>/dev/null || echo "| ${series_name} | ? | ? | ? | ? | ? |")
      final_entries+=("$row")
    else
      final_entries+=("| ${series_name} | ? | ? | N/A | N/A | N/A |")
    fi
  done

  # ${arr[@]+...} expands to nothing for an empty array without tripping
  # `set -u` on older bash versions.
  generate_index "$OUTPUT_BASE" ${final_entries[@]+"${final_entries[@]}"}

  echo ""
  echo -e "${BOLD}Corpus build complete.${NC}"
  log "Output: ${OUTPUT_BASE}/"
  log "Versions processed: ${processed}"
  if [[ ${failed} -gt 0 ]]; then
    warn "Versions failed: ${failed}"
  fi
}
|
||||
|
||||
main "$@"
|
||||
|
|
@ -96,7 +96,7 @@ gcloud run deploy "${SERVICE_NAME}" \
|
|||
--set-env-vars="DRAGNES_BRAIN_URL=https://pi.ruv.io" \
|
||||
--set-env-vars="DRAGNES_MODEL_VERSION=0.1.0" \
|
||||
--update-secrets="OPENAI_API_KEY=OPENROUTER_API_KEY:latest" \
|
||||
--set-env-vars='MCP_SERVERS=[{"name":"pi-brain","url":"https://pi.ruv.io/sse"}]'
|
||||
--set-env-vars='MCP_SERVERS=[{"name":"pi-brain","url":"https://mcp.pi.ruv.io"}]'
|
||||
|
||||
# ---------- CDN for WASM assets -----------------------------------------------
|
||||
|
||||
|
|
|
|||
211
scripts/lib/module-splitter.mjs
Executable file
211
scripts/lib/module-splitter.mjs
Executable file
|
|
@ -0,0 +1,211 @@
|
|||
#!/usr/bin/env node
|
||||
/**
|
||||
* module-splitter.mjs - Split a Claude Code CLI bundle into logical modules.
|
||||
*
|
||||
* Given a path to cli.js / cli.mjs, extracts recognizable subsystems
|
||||
* (tools, MCP, permissions, streaming, agent-loop, compaction, telemetry)
|
||||
* and writes individual .js files plus a metrics.json manifest.
|
||||
*
|
||||
* Usage:
|
||||
* node scripts/lib/module-splitter.mjs <cli-bundle> <output-dir>
|
||||
*/
|
||||
|
||||
import { readFileSync, writeFileSync, mkdirSync, statSync } from 'fs';
|
||||
import { join, basename } from 'path';
|
||||
|
||||
// Keyword table for statement classification: module name -> substrings.
// classifyStatements() tests each chunk against the modules in the order
// they are declared here and files the chunk under the first module that
// has any keyword contained in the chunk, so declaration order is significant.
const MODULE_KEYWORDS = {
  'tool-dispatch': [
    'BashTool',
    'FileReadTool',
    'FileEditTool',
    'FileWriteTool',
    'AgentOutputTool',
    'WebFetch',
    'WebSearch',
    'TodoWrite',
    'NotebookEdit',
    'GlobTool',
    'GrepTool',
  ],
  'permission-system': [
    'canUseTool',
    'alwaysAllowRules',
    'denyWrite',
    'Permission',
    'permission',
  ],
  'mcp-client': [
    'mcp__',
    'McpClient',
    'McpServer',
    'McpError',
    'callTool',
    'listTools',
  ],
  'streaming-handler': [
    'content_block_delta',
    'message_start',
    'message_stop',
    'message_delta',
    'content_block_start',
    'content_block_stop',
    'stream_event',
    'text_delta',
    'input_json_delta',
  ],
  'context-manager': [
    'tengu_compact',
    'microcompact',
    'auto_compact',
    'compact_boundary',
    'preCompactTokenCount',
    'postCompactTokenCount',
    'compaction',
  ],
  'agent-loop': [
    'agentLoop',
    'mainLoop',
    'querySource',
    'toolUseContext',
    'systemPrompt',
  ],
};
|
||||
|
||||
// Whole-bundle regex extractions: result name -> global pattern.
// Each pattern is run over the entire source text by extractSimplePatterns().
const SIMPLE_PATTERNS = {
  // Double-quoted string literals beginning with "tengu_
  'telemetry': /"tengu_[^"]*"/g,
  // name:"<lowercase-and-dashes>",description:"…" pairs
  'commands': /name:"[a-z][-a-z]*",description:"[^"]*"/g,
  // `class X`, optionally followed by ` extends Y`
  'class-hierarchy': /class \w+( extends \w+)?/g,
};
|
||||
|
||||
/**
 * Split source into semicolon-delimited chunks of bounded size.
 *
 * The text is cut at every ';' (purely textual - semicolons inside string
 * literals are cut points too) and the pieces are greedily re-joined with
 * ';' while the running chunk stays within `maxChunk` characters. A single
 * piece longer than `maxChunk` is emitted as its own oversized chunk.
 *
 * @param {string} source - Bundle text.
 * @param {number} [maxChunk=2048] - Target upper bound for a chunk's length,
 *   chosen for vector-embedding granularity.
 * @returns {string[]} Chunks in source order; empty array for empty input.
 */
function splitStatements(source, maxChunk = 2048) {
  const chunks = [];
  let buffer = '';

  for (const part of source.split(';')) {
    const wouldOverflow = buffer.length + part.length > maxChunk;
    if (wouldOverflow && buffer.length > 0) {
      // Close the current chunk; the separating ';' is not carried over.
      chunks.push(buffer);
      buffer = part;
    } else {
      buffer += (buffer ? ';' : '') + part;
    }
  }

  if (buffer.length > 0) chunks.push(buffer);
  return chunks;
}
|
||||
|
||||
/**
 * Assign statements to modules based on keyword matching.
 *
 * Statements shorter than 10 characters are ignored. Every other statement
 * is filed (trimmed) under the first module - in the table's key order -
 * that has at least one keyword contained in the statement; statements that
 * match no module are dropped.
 *
 * @param {string[]} statements - Chunks to classify.
 * @param {Record<string, string[]>} [keywordMap=MODULE_KEYWORDS] - Module
 *   name -> keyword substrings. Key order decides ties.
 * @returns {Record<string, string[]>} Module name -> matching statements, in
 *   input order. Modules without any match are absent.
 */
function classifyStatements(statements, keywordMap = MODULE_KEYWORDS) {
  const rules = Object.entries(keywordMap);
  const modules = {};

  for (const stmt of statements) {
    if (stmt.length < 10) continue;

    // first-match wins
    const rule = rules.find(([, keywords]) => keywords.some((kw) => stmt.includes(kw)));
    if (rule === undefined) continue;

    const [modName] = rule;
    if (!modules[modName]) modules[modName] = [];
    modules[modName].push(stmt.trim());
  }

  return modules;
}
|
||||
|
||||
/**
 * Extract simple pattern matches (telemetry events, commands, classes).
 *
 * Every pattern is run over the whole source; matches are trimmed, matches
 * of 3 characters or fewer are discarded and duplicates are removed while
 * keeping first-seen order.
 *
 * Each pattern is scanned through a fresh global copy, so the caller's
 * RegExp objects are never mutated (no `lastIndex` side effects) and a
 * pattern supplied without the `g` flag still yields all of its matches.
 *
 * @param {string} source - Bundle text.
 * @param {Record<string, RegExp>} [patterns=SIMPLE_PATTERNS] - Result name ->
 *   pattern.
 * @returns {Record<string, string[]>} Result name -> unique matches. Names
 *   without any match are absent.
 */
function extractSimplePatterns(source, patterns = SIMPLE_PATTERNS) {
  const results = {};

  for (const [modName, pattern] of Object.entries(patterns)) {
    const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
    const scanner = new RegExp(pattern.source, flags);
    const matches = new Set();

    for (const m of source.matchAll(scanner)) {
      const frag = m[0].trim();
      if (frag.length > 3) matches.add(frag);
    }

    if (matches.size > 0) {
      results[modName] = [...matches];
    }
  }

  return results;
}
|
||||
|
||||
/**
 * Compute basic metrics about the CLI bundle.
 *
 * All counts are plain regex hit counts over the raw text; the size is read
 * from the file system rather than derived from the string.
 *
 * @param {string} source - Bundle text.
 * @param {string} filePath - Path of the bundle on disk (used for its size).
 * @returns {{version: string, sizeBytes: number, lines: number,
 *   functions: number, asyncFunctions: number, arrowFunctions: number,
 *   classes: number, extends: number}} `version` is 'unknown' when no
 *   `VERSION=`/`VERSION:` marker followed by X.Y.Z is found.
 */
function computeMetrics(source, filePath) {
  // Number of non-overlapping matches of a global regex in the bundle.
  const tally = (re) => {
    const hits = source.match(re);
    return hits === null ? 0 : hits.length;
  };

  const { size } = statSync(filePath);
  const found = /VERSION[=:]"?(\d+\.\d+\.\d+)/.exec(source);

  return {
    version: found === null ? 'unknown' : found[1],
    sizeBytes: size,
    lines: source.split('\n').length,
    functions: tally(/function\s*\w*\s*\(/g),
    asyncFunctions: tally(/async\s+function/g),
    arrowFunctions: tally(/=>/g),
    classes: tally(/class \w+/g),
    extends: tally(/extends \w+/g),
  };
}
|
||||
|
||||
/**
 * Write one `<module>.js` file per entry of `modules` into `outputDir`.
 *
 * @param {string} outputDir - Existing directory to write into.
 * @param {Object<string, string[]>} modules - Module name -> fragments.
 * @param {string} separator - String placed between fragments in the file.
 * @returns {Object<string, {fragments: number, sizeBytes: number}>} Per-module
 *   fragment count and UTF-8 byte size of the written text.
 */
function writeModuleFiles(outputDir, modules, separator) {
  const summary = {};
  for (const [modName, fragments] of Object.entries(modules)) {
    // Join once and reuse the text for both the write and the size.
    const text = fragments.join(separator);
    writeFileSync(join(outputDir, `${modName}.js`), text, 'utf-8');
    summary[modName] = {
      fragments: fragments.length,
      sizeBytes: Buffer.byteLength(text),
    };
    console.log(` Module "${modName}": ${fragments.length} fragments`);
  }
  return summary;
}

/**
 * Main entry point.
 *
 * Reads `<cli-bundle>` and `<output-dir>` from the command line, writes one
 * `.js` file per detected module plus `metrics.json` into the output
 * directory, and prints the manifest as a single JSON line on stdout after
 * the progress messages. Exits with status 1 when an argument is missing.
 */
function main() {
  const [bundlePath, outputDir] = process.argv.slice(2);
  if (!bundlePath || !outputDir) {
    console.error('Usage: node module-splitter.mjs <cli-bundle> <output-dir>');
    process.exit(1);
  }

  mkdirSync(outputDir, { recursive: true });

  console.log(`Reading bundle: ${bundlePath}`);
  const source = readFileSync(bundlePath, 'utf-8');
  const metrics = computeMetrics(source, bundlePath);
  console.log(` Size: ${(metrics.sizeBytes / 1024 / 1024).toFixed(1)} MB, ` +
    `${metrics.classes} classes, ${metrics.functions} functions`);

  // Phase 1: statement-based classification (fast, O(n) per keyword set)
  console.log(' Splitting into statements...');
  const statements = splitStatements(source);
  console.log(` ${statements.length} statements`);

  const classified = classifyStatements(statements);
  const moduleResults = writeModuleFiles(outputDir, classified, '\n\n');

  // Phase 2: simple pattern extractions (telemetry, commands, classes).
  // A phase-2 module with the same name as a phase-1 module overwrites both
  // its file and its summary entry.
  // NOTE(review): these files are joined with a single '\n', while
  // rvf-builder.mjs splits module files on '\n\n' and so sees each phase-2
  // file as one fragment -- confirm this is intended.
  console.log(' Extracting simple patterns...');
  const simple = extractSimplePatterns(source);
  Object.assign(moduleResults, writeModuleFiles(outputDir, simple, '\n'));

  // Write metrics manifest
  const manifest = {
    ...metrics,
    sourceFile: basename(bundlePath),
    extractedAt: new Date().toISOString(),
    modules: moduleResults,
  };
  writeFileSync(
    join(outputDir, 'metrics.json'),
    JSON.stringify(manifest, null, 2)
  );

  // Output JSON summary to stdout for the caller script
  console.log(JSON.stringify(manifest));
}
|
||||
|
||||
main();
|
||||
259
scripts/lib/rvf-builder.mjs
Executable file
259
scripts/lib/rvf-builder.mjs
Executable file
|
|
@ -0,0 +1,259 @@
|
|||
#!/usr/bin/env node
|
||||
/**
|
||||
* rvf-builder.mjs - Create binary RVF containers from extracted source modules.
|
||||
*
|
||||
* Uses the @ruvector/rvf-node native backend to produce real binary .rvf files
|
||||
* with HNSW-indexed vector embeddings and witness chains.
|
||||
*
|
||||
* Each source fragment is embedded as a deterministic vector derived from its
* content hash (a lightweight "fingerprint" embedding). Because the vector is
* hash-derived, it supports exact-match lookup and change detection across
* versions without requiring a full ML embedding model; fragments that are
* merely similar do not end up close to each other.
|
||||
*
|
||||
* Usage:
|
||||
* node scripts/lib/rvf-builder.mjs <source-dir> <output.rvf> [--meta key=val ...]
|
||||
*
|
||||
* source-dir : directory with .js module files + metrics.json
|
||||
* output.rvf : path for the binary RVF container
|
||||
* --meta : optional key=value metadata pairs
|
||||
*/
|
||||
|
||||
import { readFileSync, readdirSync, existsSync, writeFileSync } from 'fs';
import { join, basename, resolve } from 'path';
import { createHash } from 'crypto';
import { pathToFileURL } from 'url';
|
||||
|
||||
// Vector dimension for fingerprint embeddings
|
||||
const DIMENSIONS = 128;
|
||||
|
||||
/**
 * Generate a deterministic fingerprint vector from text content.
 *
 * The SHA-256 digest of `text` is expanded into `dimensions` floats:
 * component i combines digest bytes `i % len` and `(i * 7 + 13) % len` into a
 * 16-bit value that is mapped linearly onto [-1, 1]. The vector is then
 * scaled to unit length for cosine distance.
 *
 * Both byte indices repeat with the digest length (32 bytes for SHA-256), so
 * the components repeat with period 32: a 128-d vector holds four copies of
 * the same 32 values.
 *
 * This is NOT a semantic embedding but a content fingerprint that allows
 * exact-match deduplication and change detection across versions.
 *
 * @param {string} text - Content to fingerprint.
 * @param {number} [dimensions=DIMENSIONS] - Number of vector components.
 * @returns {Float32Array} Unit-length vector of `dimensions` components.
 */
function fingerprintVector(text, dimensions = DIMENSIONS) {
  const digest = createHash('sha256').update(text).digest();
  const span = digest.length;

  // Mix two digest bytes into one 16-bit value per component, mapped to [-1, 1].
  const vec = Float32Array.from({ length: dimensions }, (_, i) => {
    const hi = digest[i % span];
    const lo = digest[(i * 7 + 13) % span];
    return ((hi * 256 + lo) / 65535) * 2 - 1;
  });

  // Normalize to unit length for cosine distance
  const norm = Math.sqrt(vec.reduce((sum, x) => sum + x * x, 0));
  if (norm > 0) {
    for (let i = 0; i < dimensions; i++) vec[i] /= norm;
  }

  return vec;
}
|
||||
|
||||
/**
 * Load the native rvf-node backend.
 *
 * Looks for the module's `index.js` in two places relative to the current
 * working directory (the in-repo package first, then node_modules) and
 * imports the first one that exists.
 *
 * @returns {Promise<*>} The module's `RvfDatabase` export (named, or on the
 *   default export); if neither exists, the module namespace itself.
 * @throws {Error} If none of the candidate files exists.
 */
async function loadRvfNode() {
  // Try several possible paths for the native module
  const candidates = [
    'npm/packages/rvf-node/index.js',
    'node_modules/@ruvector/rvf-node/index.js',
  ].map((rel) => resolve(process.cwd(), rel));

  const found = candidates.find((p) => existsSync(p));
  if (found === undefined) {
    throw new Error(
      'Could not find @ruvector/rvf-node. Tried:\n ' + candidates.join('\n ')
    );
  }

  // import() expects a URL-style specifier, not an OS path. An absolute
  // Windows path such as "C:\..." is rejected, so convert to a file:// URL.
  const mod = await import(pathToFileURL(found).href);
  return mod.RvfDatabase ?? mod.default?.RvfDatabase ?? mod;
}
|
||||
|
||||
/**
 * Parse --meta key=value arguments from argv.
 *
 * Two spellings are accepted:
 *   --meta key=value     (flag followed by a separate argument)
 *   --meta=key=value     (single argument)
 * Only the first '=' of the pair separates key from value, so values may
 * themselves contain '='. A pair without '=' yields an empty-string value.
 * A trailing `--meta` with nothing after it, and all other arguments, are
 * ignored. When a key is repeated the last occurrence wins.
 *
 * @param {string[]} argv - Command-line arguments to scan.
 * @returns {Object<string, string>} Collected key/value pairs.
 */
function parseMeta(argv) {
  const INLINE_PREFIX = '--meta=';
  const meta = {};

  const record = (pair) => {
    const eq = pair.indexOf('=');
    if (eq === -1) {
      meta[pair] = '';
    } else {
      meta[pair.slice(0, eq)] = pair.slice(eq + 1);
    }
  };

  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (arg === '--meta' && argv[i + 1]) {
      record(argv[i + 1]);
      i++; // the pair was consumed together with the flag
    } else if (arg.startsWith(INLINE_PREFIX) && arg.length > INLINE_PREFIX.length) {
      record(arg.slice(INLINE_PREFIX.length));
    }
  }

  return meta;
}
|
||||
|
||||
/**
 * Main entry point.
 *
 * Reads `<source-dir>` and `<output.rvf>` (plus optional `--meta key=val`
 * pairs) from the command line, creates the RVF container, ingests one
 * fingerprint vector per fragment of every `.js` file in the source
 * directory, and writes a `<output.rvf>.manifest.json` sidecar that maps each
 * vector id back to its module and fragment. Finishes by printing a one-line
 * JSON summary to stdout.
 *
 * Exits with status 1 on missing arguments, when the native backend cannot
 * be loaded, or when the source directory contains no `.js` files. Any other
 * failure propagates to the caller as a rejected promise.
 */
async function main() {
  const args = process.argv.slice(2);
  const [sourceDir, outputRvf] = args;

  if (!sourceDir || !outputRvf) {
    console.error(
      'Usage: node rvf-builder.mjs <source-dir> <output.rvf> [--meta key=val ...]'
    );
    process.exit(1);
  }

  const meta = parseMeta(args.slice(2));

  // Load native RVF module
  let RvfDatabase;
  try {
    RvfDatabase = await loadRvfNode();
  } catch (err) {
    console.error('Failed to load @ruvector/rvf-node:', err.message);
    process.exit(1);
  }

  // Read metrics if available
  const metricsPath = join(sourceDir, 'metrics.json');
  const metrics = existsSync(metricsPath)
    ? JSON.parse(readFileSync(metricsPath, 'utf-8'))
    : {};

  // Collect all .js module files (sorted so vector ids are reproducible)
  const moduleFiles = readdirSync(sourceDir)
    .filter((f) => f.endsWith('.js'))
    .sort();

  if (moduleFiles.length === 0) {
    console.error(`No .js module files found in ${sourceDir}`);
    process.exit(1);
  }

  console.log(
    `Building RVF container: ${basename(outputRvf)} (${moduleFiles.length} modules, ${DIMENSIONS}d vectors)`
  );

  // Create the RVF database
  const db = RvfDatabase.create(outputRvf, {
    dimension: DIMENSIONS,
    metric: 'Cosine',
    profile: 0,
    compression: 'None',
    signing: false,
    m: 16,
    ef_construction: 200,
  });

  const sidecarPath = outputRvf + '.manifest.json';
  const idMap = {};
  let totalFragments = 0;
  let vectorId = 1;
  let status;
  let fileId;

  // Everything that uses the open database lives in this try block so the
  // handle is closed even when ingest or the manifest write throws.
  try {
    // Ingest vectors for each module fragment
    for (const modFile of moduleFiles) {
      const modName = basename(modFile, '.js');
      const content = readFileSync(join(sourceDir, modFile), 'utf-8');
      // Fragments are separated by blank lines; tiny ones are ignored.
      const fragments = content.split('\n\n').filter((f) => f.trim().length > 10);

      if (fragments.length === 0) continue;

      // Build a flat vector array and IDs for batch ingest
      const vectors = new Float32Array(fragments.length * DIMENSIONS);
      const ids = [];

      fragments.forEach((fragment, i) => {
        vectors.set(fingerprintVector(fragment), i * DIMENSIONS);
        ids.push(vectorId);
        idMap[vectorId] = {
          module: modName,
          fragmentIndex: i,
          sizeBytes: Buffer.byteLength(fragment),
          hash: createHash('sha256').update(fragment).digest('hex').slice(0, 16),
        };
        vectorId++;
      });

      const ingest = db.ingestBatch(vectors, ids);
      totalFragments += ingest.accepted;
      console.log(
        ` ${modName}: ${ingest.accepted} vectors ingested (${fragments.length} fragments)`
      );
    }

    // Get final status
    status = db.status();
    fileId = db.fileId();
    const segments = db.segments();

    // Write the ID mapping sidecar (extends the default .idmap.json)
    const manifest = {
      format: 'rvf-binary',
      version: '1.0',
      fileId,
      dimensions: DIMENSIONS,
      metric: 'cosine',
      totalVectors: status.totalVectors,
      totalSegments: status.totalSegments,
      fileSizeBytes: status.fileSize,
      epoch: status.currentEpoch,
      segments: segments.map((s) => ({
        id: s.id,
        type: s.segType,
        offset: s.offset,
        payloadLength: s.payloadLength,
      })),
      source: {
        package: meta.package || '@anthropic-ai/claude-code',
        version: meta.version || metrics.version || 'unknown',
        extractedAt: metrics.extractedAt || new Date().toISOString(),
        metrics: {
          bundleSizeBytes: metrics.sizeBytes || 0,
          classes: metrics.classes || 0,
          functions: metrics.functions || 0,
          asyncFunctions: metrics.asyncFunctions || 0,
          arrowFunctions: metrics.arrowFunctions || 0,
        },
      },
      modules: Object.entries(metrics.modules || {}).map(([name, info]) => ({
        name,
        ...info,
      })),
      idMap,
      meta,
      createdAt: new Date().toISOString(),
    };

    writeFileSync(sidecarPath, JSON.stringify(manifest, null, 2));
  } finally {
    db.close();
  }

  console.log(`\nRVF container created successfully:`);
  console.log(` File: ${outputRvf}`);
  console.log(` File ID: ${fileId}`);
  console.log(` Vectors: ${totalFragments}`);
  console.log(` Segments: ${status.totalSegments}`);
  console.log(` Size: ${(status.fileSize / 1024).toFixed(1)} KB`);
  console.log(` Manifest: ${sidecarPath}`);

  // Output JSON for caller
  const summary = {
    success: true,
    path: outputRvf,
    fileId,
    vectors: totalFragments,
    segments: status.totalSegments,
    sizeBytes: status.fileSize,
  };
  console.log(JSON.stringify(summary));
}
|
||||
|
||||
// Script entry: run the async main() and turn any rejection into a logged
// error plus exit status 1.
main().catch((fatal) => {
  console.error('Fatal error:', fatal);
  process.exit(1);
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue