mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-23 04:27:11 +00:00
SSE Proxy Decoupling (ADR-130): - Fix ruvbrain-sse proxy: proper MCP handshake, session creation, drain polling - Fix internal queue endpoints: session_create keeps receiver, drain returns buffered messages - Add response_queues to AppState for SSE proxy communication - Skip sparsifier for >5M edge graphs (was crashing on 16M edges) - Add SSE_DISABLED/MAX_SSE env vars for configurable connection limits - Route SSE to dedicated mcp.pi.ruv.io subdomain (Cloudflare CNAME) - Serve SSE at root / path on proxy (no /sse needed) - Update all references from pi.ruv.io/sse to mcp.pi.ruv.io - Fix Dockerfile consciousness crate build (feature/version mismatches) Claude Code CLI Source Research (ADR-133): - 19 research documents analyzing Claude Code internals (3000+ lines) - Decompiler script + RVF corpus builder for all major versions - Binary RVF containers for v0.2, v1.0, v2.0, v2.1 (300-2068 vectors each) - Call graphs, class hierarchies, state machines from minified source Integration Strategy (ADR-134): - 6-tier integration plan: WASM MCP, agents, hooks, cache, SDK, plugin - Integration guide with architecture diagrams and performance targets Co-Authored-By: claude-flow <ruv@ruv.net>
455 lines
14 KiB
Bash
Executable file
455 lines
14 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
# claude-code-rvf-corpus.sh - Build binary RVF containers for every major
|
|
# Claude Code CLI release.
|
|
#
|
|
# Downloads the latest patch of each major.minor series from npm, extracts
|
|
# the CLI bundle, splits into modules, and creates a binary RVF container
|
|
# with vector embeddings and witness chains.
|
|
#
|
|
# Usage:
|
|
# ./scripts/claude-code-rvf-corpus.sh [--dry-run] [--series 0.2,1.0,2.0,2.1]
|
|
#
|
|
# Output: docs/research/claude-code-rvsource/versions/<vX.Y.z>/
|
|
# - claude-code-vX.Y.rvf Binary RVF container
|
|
# - claude-code-vX.Y.rvf.manifest.json Container manifest
|
|
# - source/ Extracted JS modules
|
|
# - README.md Version metadata
|
|
|
|
set -euo pipefail
|
|
|
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
OUTPUT_BASE="${ROOT_DIR}/docs/research/claude-code-rvsource/versions"
|
|
TMP_DIR="/tmp/cc-rvf-corpus-$$"
|
|
DRY_RUN=false
|
|
FILTER_SERIES=""
|
|
|
|
# Colors
|
|
RED='\033[0;31m'
|
|
GREEN='\033[0;32m'
|
|
YELLOW='\033[1;33m'
|
|
CYAN='\033[0;36m'
|
|
BOLD='\033[1m'
|
|
NC='\033[0m'
|
|
|
|
log() { echo -e "${GREEN}[+]${NC} $*"; }
|
|
info() { echo -e "${CYAN}[*]${NC} $*"; }
|
|
warn() { echo -e "${YELLOW}[!]${NC} $*"; }
|
|
err() { echo -e "${RED}[-]${NC} $*" >&2; }
|
|
|
|
cleanup() {
|
|
rm -rf "$TMP_DIR"
|
|
}
|
|
trap cleanup EXIT
|
|
|
|
# Parse arguments
|
|
while [[ $# -gt 0 ]]; do
|
|
case "$1" in
|
|
--dry-run) DRY_RUN=true; shift ;;
|
|
--series) FILTER_SERIES="$2"; shift 2 ;;
|
|
--help|-h)
|
|
echo "Usage: $0 [--dry-run] [--series 0.2,1.0,2.0,2.1]"
|
|
exit 0
|
|
;;
|
|
*) err "Unknown argument: $1"; exit 1 ;;
|
|
esac
|
|
done
|
|
|
|
# Fetch all versions from npm and group by major.minor
|
|
get_version_groups() {
|
|
log "Fetching Claude Code versions from npm..." >&2
|
|
local versions_json
|
|
versions_json=$(npm view @anthropic-ai/claude-code versions --json 2>/dev/null)
|
|
|
|
# Use node to group versions and pick latest patch per major.minor
|
|
node -e "
|
|
const versions = $versions_json;
|
|
const groups = {};
|
|
|
|
for (const v of versions) {
|
|
const parts = v.split('.');
|
|
const key = parts[0] + '.' + parts[1];
|
|
const patch = parseInt(parts[2], 10);
|
|
|
|
if (!groups[key] || patch > groups[key].patch) {
|
|
groups[key] = { version: v, patch, key };
|
|
}
|
|
}
|
|
|
|
// Sort by semver
|
|
const sorted = Object.values(groups).sort((a, b) => {
|
|
const [aMaj, aMin] = a.key.split('.').map(Number);
|
|
const [bMaj, bMin] = b.key.split('.').map(Number);
|
|
return aMaj !== bMaj ? aMaj - bMaj : aMin - bMin;
|
|
});
|
|
|
|
for (const g of sorted) {
|
|
console.log(g.key + ' ' + g.version);
|
|
}
|
|
"
|
|
}
|
|
|
|
# Download and extract a specific version
|
|
download_version() {
|
|
local version="$1"
|
|
local dest_dir="$2"
|
|
|
|
mkdir -p "$dest_dir"
|
|
info " Downloading @anthropic-ai/claude-code@${version}..."
|
|
|
|
local tgz_dir="${TMP_DIR}/tarballs"
|
|
mkdir -p "$tgz_dir"
|
|
|
|
npm pack "@anthropic-ai/claude-code@${version}" --pack-destination "$tgz_dir" \
|
|
>/dev/null 2>&1
|
|
|
|
# Find the tarball (naming varies between npm versions)
|
|
local tgz
|
|
tgz=$(ls "$tgz_dir"/anthropic-ai-claude-code-*.tgz 2>/dev/null | head -1)
|
|
if [[ -z "$tgz" ]]; then
|
|
err " Failed to download version ${version}"
|
|
return 1
|
|
fi
|
|
|
|
# Try to extract cli.js, then cli.mjs (don't list the tarball, just try)
|
|
tar xf "$tgz" -C "$dest_dir" --strip-components=1 package/cli.js 2>/dev/null || true
|
|
tar xf "$tgz" -C "$dest_dir" --strip-components=1 package/cli.mjs 2>/dev/null || true
|
|
tar xf "$tgz" -C "$dest_dir" --strip-components=1 package/package.json 2>/dev/null || true
|
|
|
|
# Rename cli.mjs -> cli.js for consistency
|
|
if [[ -f "${dest_dir}/cli.mjs" ]] && [[ ! -f "${dest_dir}/cli.js" ]]; then
|
|
mv "${dest_dir}/cli.mjs" "${dest_dir}/cli.js"
|
|
fi
|
|
|
|
if [[ ! -f "${dest_dir}/cli.js" ]]; then
|
|
warn " No cli.js or cli.mjs found in ${version}"
|
|
return 1
|
|
fi
|
|
|
|
rm -f "$tgz"
|
|
local size
|
|
size=$(du -sh "${dest_dir}/cli.js" 2>/dev/null | cut -f1)
|
|
info " Extracted cli.js (${size})"
|
|
return 0
|
|
}
|
|
|
|
# Split a CLI bundle into modules
|
|
split_modules() {
|
|
local cli_path="$1"
|
|
local source_dir="$2"
|
|
|
|
info " Splitting into modules..."
|
|
node "${SCRIPT_DIR}/lib/module-splitter.mjs" "$cli_path" "$source_dir" 2>/dev/null
|
|
}
|
|
|
|
# Build a binary RVF container
|
|
build_rvf() {
|
|
local source_dir="$1"
|
|
local rvf_path="$2"
|
|
local version="$3"
|
|
local series="$4"
|
|
|
|
info " Building binary RVF container..."
|
|
node "${SCRIPT_DIR}/lib/rvf-builder.mjs" \
|
|
"$source_dir" "$rvf_path" \
|
|
--meta "version=${version}" \
|
|
--meta "series=${series}" \
|
|
--meta "package=@anthropic-ai/claude-code" \
|
|
--meta "corpus=claude-code-rvsource" \
|
|
2>/dev/null
|
|
}
|
|
|
|
# Generate a README for a version directory
|
|
generate_readme() {
|
|
local ver_dir="$1"
|
|
local series="$2"
|
|
local version="$3"
|
|
local rvf_file="$4"
|
|
|
|
local metrics_file="${ver_dir}/source/metrics.json"
|
|
local manifest_file="${rvf_file}.manifest.json"
|
|
|
|
# Read metrics
|
|
local bundle_size="unknown"
|
|
local classes="?"
|
|
local functions="?"
|
|
local modules_count="?"
|
|
|
|
if [[ -f "$metrics_file" ]]; then
|
|
bundle_size=$(node -e "const m=JSON.parse(require('fs').readFileSync('$metrics_file','utf-8')); console.log((m.sizeBytes/1024/1024).toFixed(1)+'MB')")
|
|
classes=$(node -e "const m=JSON.parse(require('fs').readFileSync('$metrics_file','utf-8')); console.log(m.classes)")
|
|
functions=$(node -e "const m=JSON.parse(require('fs').readFileSync('$metrics_file','utf-8')); console.log(m.functions)")
|
|
modules_count=$(node -e "const m=JSON.parse(require('fs').readFileSync('$metrics_file','utf-8')); console.log(Object.keys(m.modules||{}).length)")
|
|
fi
|
|
|
|
local rvf_size="N/A"
|
|
local rvf_vectors="N/A"
|
|
local rvf_id="N/A"
|
|
if [[ -f "$manifest_file" ]]; then
|
|
rvf_size=$(node -e "const m=JSON.parse(require('fs').readFileSync('$manifest_file','utf-8')); console.log((m.fileSizeBytes/1024).toFixed(1)+'KB')")
|
|
rvf_vectors=$(node -e "const m=JSON.parse(require('fs').readFileSync('$manifest_file','utf-8')); console.log(m.totalVectors)")
|
|
rvf_id=$(node -e "const m=JSON.parse(require('fs').readFileSync('$manifest_file','utf-8')); console.log(m.fileId)")
|
|
fi
|
|
|
|
cat > "${ver_dir}/README.md" <<READMEEOF
|
|
# Claude Code v${version} (${series} series)
|
|
|
|
## Binary RVF Container
|
|
|
|
| Property | Value |
|
|
|----------|-------|
|
|
| Version | ${version} |
|
|
| Series | ${series} |
|
|
| Bundle size | ${bundle_size} |
|
|
| RVF size | ${rvf_size} |
|
|
| Vectors | ${rvf_vectors} |
|
|
| RVF File ID | \`${rvf_id}\` |
|
|
| Classes | ${classes} |
|
|
| Functions | ${functions} |
|
|
| Modules | ${modules_count} |
|
|
| Extracted | $(date -Iseconds) |
|
|
|
|
## Files
|
|
|
|
- \`claude-code-v${series}.rvf\` - Binary RVF container with HNSW index + witness chain
|
|
- \`claude-code-v${series}.rvf.manifest.json\` - Container manifest (vector ID map, metadata)
|
|
- \`source/\` - Extracted JavaScript module fragments
|
|
|
|
## RVF Container Details
|
|
|
|
The \`.rvf\` file is a real binary container created with the \`@ruvector/rvf-node\`
|
|
native backend. It contains:
|
|
|
|
- **128-dimensional fingerprint vectors** for each code fragment
|
|
- **HNSW index** (M=16, ef_construction=200) for fast similarity search
|
|
- **Cosine distance** metric
|
|
- **Witness chain** for provenance verification
|
|
|
|
To query this container:
|
|
|
|
\`\`\`typescript
|
|
import { RvfDatabase } from '@ruvector/rvf';
|
|
|
|
const db = await RvfDatabase.openReadonly('./claude-code-v${series}.rvf');
|
|
const results = await db.query(queryVector, 10);
|
|
await db.close();
|
|
\`\`\`
|
|
READMEEOF
|
|
}
|
|
|
|
# Generate the top-level index README
|
|
generate_index() {
|
|
local base_dir="$1"
|
|
shift
|
|
local entries=("$@")
|
|
|
|
cat > "${base_dir}/README.md" <<'INDEXHEADER'
|
|
# Claude Code RVF Corpus
|
|
|
|
Binary RVF containers for every major Claude Code CLI release, with
|
|
HNSW-indexed vector embeddings and witness chains for provenance.
|
|
|
|
## Versions
|
|
|
|
| Series | Version | Bundle | RVF Size | Vectors | File ID |
|
|
|--------|---------|--------|----------|---------|---------|
|
|
INDEXHEADER
|
|
|
|
for entry in "${entries[@]}"; do
|
|
echo "$entry" >> "${base_dir}/README.md"
|
|
done
|
|
|
|
cat >> "${base_dir}/README.md" <<'INDEXFOOTER'
|
|
|
|
## How to Use
|
|
|
|
```bash
|
|
# Build the corpus
|
|
./scripts/claude-code-rvf-corpus.sh
|
|
|
|
# Build only specific series
|
|
./scripts/claude-code-rvf-corpus.sh --series 2.0,2.1
|
|
```
|
|
|
|
## Format
|
|
|
|
Each version directory contains:
|
|
- A binary `.rvf` container (128-dim cosine-distance HNSW index)
|
|
- A `.manifest.json` sidecar with vector-to-fragment mapping
|
|
- Extracted JavaScript modules in `source/`
|
|
|
|
Generated by `scripts/claude-code-rvf-corpus.sh` using `@ruvector/rvf-node`.
|
|
INDEXFOOTER
|
|
}
|
|
|
|
# -----------------------------------------------------------------------
|
|
# Main
|
|
# -----------------------------------------------------------------------
|
|
|
|
main() {
|
|
echo -e "${BOLD}Claude Code RVF Corpus Builder${NC}"
|
|
echo -e "${BOLD}==============================${NC}"
|
|
echo ""
|
|
|
|
mkdir -p "$TMP_DIR" "$OUTPUT_BASE"
|
|
|
|
# Get version groups
|
|
local groups
|
|
groups=$(get_version_groups)
|
|
|
|
if [[ -z "$groups" ]]; then
|
|
err "No versions found on npm"
|
|
exit 1
|
|
fi
|
|
|
|
local total_groups
|
|
total_groups=$(echo "$groups" | wc -l)
|
|
log "Found ${total_groups} major.minor series"
|
|
|
|
# Apply filter if specified
|
|
if [[ -n "$FILTER_SERIES" ]]; then
|
|
local filtered=""
|
|
IFS=',' read -ra FILTER_ARRAY <<< "$FILTER_SERIES"
|
|
while IFS= read -r line; do
|
|
local series
|
|
series=$(echo "$line" | awk '{print $1}')
|
|
for f in "${FILTER_ARRAY[@]}"; do
|
|
if [[ "$series" == "$f" ]]; then
|
|
filtered+="${line}"$'\n'
|
|
fi
|
|
done
|
|
done <<< "$groups"
|
|
groups="$filtered"
|
|
total_groups=$(echo -n "$groups" | grep -c '^' || echo 0)
|
|
log "Filtered to ${total_groups} series: ${FILTER_SERIES}"
|
|
fi
|
|
|
|
if $DRY_RUN; then
|
|
warn "DRY RUN - would process these versions:"
|
|
echo "$groups" | while IFS= read -r line; do
|
|
[[ -z "$line" ]] && continue
|
|
local series version
|
|
series=$(echo "$line" | awk '{print $1}')
|
|
version=$(echo "$line" | awk '{print $2}')
|
|
echo " v${series}.x -> ${version}"
|
|
done
|
|
exit 0
|
|
fi
|
|
|
|
local processed=0
|
|
local failed=0
|
|
|
|
while IFS= read -r line; do
|
|
[[ -z "$line" ]] && continue
|
|
|
|
local series version
|
|
series=$(echo "$line" | awk '{print $1}')
|
|
version=$(echo "$line" | awk '{print $2}')
|
|
|
|
echo ""
|
|
log "Processing v${series}.x (latest: ${version})"
|
|
|
|
local ver_dir="${OUTPUT_BASE}/v${series}.x"
|
|
local source_dir="${ver_dir}/source"
|
|
local rvf_file="${ver_dir}/claude-code-v${series}.rvf"
|
|
local extract_dir="${TMP_DIR}/extract-${version}"
|
|
|
|
mkdir -p "$ver_dir" "$source_dir"
|
|
|
|
# Step 1: Download
|
|
if ! download_version "$version" "$extract_dir"; then
|
|
warn " Skipping ${version} (download failed)"
|
|
((failed++)) || true
|
|
continue
|
|
fi
|
|
|
|
local cli_path="${extract_dir}/cli.js"
|
|
if [[ ! -f "$cli_path" ]]; then
|
|
warn " No CLI bundle found for ${version}"
|
|
((failed++)) || true
|
|
continue
|
|
fi
|
|
|
|
# Step 2: Split into modules
|
|
if ! split_modules "$cli_path" "$source_dir"; then
|
|
warn " Module splitting failed for ${version}"
|
|
fi
|
|
|
|
# Step 3: Build binary RVF container
|
|
if build_rvf "$source_dir" "$rvf_file" "$version" "$series"; then
|
|
log " RVF container created: $(basename "$rvf_file")"
|
|
else
|
|
warn " RVF creation failed for ${version}"
|
|
# Create a fallback TODO note
|
|
cat > "${ver_dir}/TODO-rvf.md" <<EOF
|
|
# TODO: Create RVF Container
|
|
|
|
Version: ${version}
|
|
Series: v${series}.x
|
|
Error: RVF binary creation failed
|
|
|
|
The source modules have been extracted to \`source/\` but the binary
|
|
RVF container could not be created. This typically means the
|
|
\`@ruvector/rvf-node\` native backend is not available.
|
|
|
|
To create the container manually:
|
|
|
|
\`\`\`bash
|
|
node scripts/lib/rvf-builder.mjs source/ claude-code-v${series}.rvf \\
|
|
--meta version=${version} --meta series=${series}
|
|
\`\`\`
|
|
EOF
|
|
fi
|
|
|
|
# Step 4: Generate README
|
|
generate_readme "$ver_dir" "$series" "$version" "$rvf_file"
|
|
|
|
# Clean up extracted tarball content
|
|
rm -rf "$extract_dir"
|
|
|
|
((processed++)) || true
|
|
log " Done (${processed}/${total_groups})"
|
|
done <<< "$groups"
|
|
|
|
# Generate index
|
|
echo ""
|
|
log "Generating corpus index..."
|
|
|
|
# Rebuild index entries by scanning output dirs
|
|
local final_entries=()
|
|
for d in "${OUTPUT_BASE}"/v*.x; do
|
|
[[ -d "$d" ]] || continue
|
|
local dir_name
|
|
dir_name=$(basename "$d")
|
|
local series_name="${dir_name#v}"
|
|
series_name="${series_name%.x}"
|
|
|
|
local manifest="${d}/claude-code-v${series_name}.rvf.manifest.json"
|
|
if [[ -f "$manifest" ]]; then
|
|
local row
|
|
row=$(node -e "
|
|
const m=JSON.parse(require('fs').readFileSync('$manifest','utf-8'));
|
|
const s=m.source||{};
|
|
const met=s.metrics||{};
|
|
const bundle=(met.bundleSizeBytes/1024/1024).toFixed(1)+'MB';
|
|
const rvfSize=(m.fileSizeBytes/1024).toFixed(1)+'KB';
|
|
console.log('| ${series_name} | '+s.version+' | '+bundle+' | '+rvfSize+' | '+m.totalVectors+' | \`'+m.fileId.slice(0,12)+'...\` |');
|
|
" 2>/dev/null || echo "| ${series_name} | ? | ? | ? | ? | ? |")
|
|
final_entries+=("$row")
|
|
else
|
|
final_entries+=("| ${series_name} | ? | ? | N/A | N/A | N/A |")
|
|
fi
|
|
done
|
|
|
|
generate_index "$OUTPUT_BASE" "${final_entries[@]}"
|
|
|
|
echo ""
|
|
echo -e "${BOLD}Corpus build complete.${NC}"
|
|
log "Output: ${OUTPUT_BASE}/"
|
|
log "Versions processed: ${processed:-0}"
|
|
if [[ ${failed:-0} -gt 0 ]]; then
|
|
warn "Versions failed: ${failed}"
|
|
fi
|
|
}
|
|
|
|
main "$@"
|