#!/usr/bin/env bash
# claude-code-rvf-corpus.sh - Build binary RVF containers for every major
# Claude Code CLI release.
#
# Downloads the latest patch of each major.minor series from npm, extracts
# the CLI bundle, splits into modules, and creates a binary RVF container
# with vector embeddings and witness chains.
#
# Usage:
#   ./scripts/claude-code-rvf-corpus.sh [--dry-run] [--series 0.2,1.0,2.0,2.1]
#
# Output: docs/research/claude-code-rvsource/versions/vX.Y.x/
#   - claude-code-vX.Y.rvf                 Binary RVF container
#   - claude-code-vX.Y.rvf.manifest.json   Container manifest
#   - source/                              Extracted JS modules
#   - README.md                            Version metadata

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
OUTPUT_BASE="${ROOT_DIR}/docs/research/claude-code-rvsource/versions"
# Scratch directory; created with mktemp in main() so the path is not
# predictable and nothing is created for --help or argument errors.
TMP_DIR=""
DRY_RUN=false
FILTER_SERIES=""

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

log()  { echo -e "${GREEN}[+]${NC} $*"; }
info() { echo -e "${CYAN}[*]${NC} $*"; }
warn() { echo -e "${YELLOW}[!]${NC} $*"; }
err()  { echo -e "${RED}[-]${NC} $*" >&2; }

# Remove the scratch directory on any exit path (no-op until it exists).
cleanup() {
  if [[ -n "${TMP_DIR:-}" ]]; then
    rm -rf -- "$TMP_DIR"
  fi
}
trap cleanup EXIT

# Parse arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --dry-run)
      DRY_RUN=true
      shift
      ;;
    --series)
      # Without this check a bare "--series" dies with "$2: unbound variable".
      if [[ $# -lt 2 || -z "${2:-}" ]]; then
        err "--series requires a comma-separated list (e.g. 2.0,2.1)"
        exit 1
      fi
      FILTER_SERIES="$2"
      shift 2
      ;;
    --help|-h)
      echo "Usage: $0 [--dry-run] [--series 0.2,1.0,2.0,2.1]"
      exit 0
      ;;
    *)
      err "Unknown argument: $1"
      exit 1
      ;;
  esac
done

#######################################
# Fetch all published versions from npm and pick the latest patch of each
# major.minor series.
# Outputs:   one "<major.minor> <version>" line per series on stdout,
#            ordered by major then minor; progress/errors on stderr.
# Returns:   non-zero if npm or node fails.
#######################################
get_version_groups() {
  log "Fetching Claude Code versions from npm..." >&2

  local versions_json
  if ! versions_json=$(npm view @anthropic-ai/claude-code versions --json 2>/dev/null); then
    err "npm view failed for @anthropic-ai/claude-code"
    return 1
  fi

  # The JSON is fed on stdin rather than spliced into the JS source, so the
  # registry response can never be interpreted as code. npm prints a bare
  # string when a package has a single version, hence the [].concat().
  node -e '
    const raw = JSON.parse(require("fs").readFileSync(0, "utf8"));
    const versions = [].concat(raw);
    const groups = {};
    for (const v of versions) {
      const parts = v.split(".");
      const key = parts[0] + "." + parts[1];
      const patch = parseInt(parts[2], 10);
      if (!groups[key] || patch > groups[key].patch) {
        groups[key] = { version: v, patch, key };
      }
    }
    // Sort by major, then minor
    const sorted = Object.values(groups).sort((a, b) => {
      const [aMaj, aMin] = a.key.split(".").map(Number);
      const [bMaj, bMin] = b.key.split(".").map(Number);
      return aMaj !== bMaj ? aMaj - bMaj : aMin - bMin;
    });
    for (const g of sorted) {
      console.log(g.key + " " + g.version);
    }
  ' <<< "$versions_json"
}

#######################################
# Download one package version and extract its CLI bundle.
# Arguments: $1 - exact version (e.g. 2.1.5)
#            $2 - destination directory; receives cli.js (and package.json
#                 when the tarball has one)
# Returns:   0 on success, 1 if the download failed or no bundle was found.
#######################################
download_version() {
  local version="$1"
  local dest_dir="$2"

  mkdir -p "$dest_dir"

  info " Downloading @anthropic-ai/claude-code@${version}..."

  # One tarball directory per version: a shared directory let a tarball left
  # behind by an earlier failed version be picked up for a later one.
  local tgz_dir="${TMP_DIR}/tarballs/${version}"
  mkdir -p "$tgz_dir"

  # A failed pack is detected below by the absence of a tarball.
  npm pack "@anthropic-ai/claude-code@${version}" --pack-destination "$tgz_dir" \
    >/dev/null 2>&1 || true

  # Find the tarball (naming varies between npm versions)
  local tgz="" candidate
  for candidate in "$tgz_dir"/anthropic-ai-claude-code-*.tgz; do
    if [[ -f "$candidate" ]]; then
      tgz="$candidate"
      break
    fi
  done

  if [[ -z "$tgz" ]]; then
    err " Failed to download version ${version}"
    rm -rf -- "$tgz_dir"
    return 1
  fi

  # Try each candidate member; a tarball will normally lack some of them, so
  # individual extraction failures are expected and ignored.
  local member
  for member in package/cli.js package/cli.mjs package/package.json; do
    tar xf "$tgz" -C "$dest_dir" --strip-components=1 "$member" 2>/dev/null || true
  done

  # The tarball is no longer needed on either the success or failure path.
  rm -rf -- "$tgz_dir"

  # Rename cli.mjs -> cli.js for consistency
  if [[ -f "${dest_dir}/cli.mjs" && ! -f "${dest_dir}/cli.js" ]]; then
    mv "${dest_dir}/cli.mjs" "${dest_dir}/cli.js"
  fi

  if [[ ! -f "${dest_dir}/cli.js" ]]; then
    warn " No cli.js or cli.mjs found in ${version}"
    return 1
  fi

  local size
  size=$(du -sh "${dest_dir}/cli.js" 2>/dev/null | cut -f1)
  info " Extracted cli.js (${size})"
  return 0
}

#######################################
# Split a CLI bundle into modules via lib/module-splitter.mjs.
# Arguments: $1 - path to cli.js
#            $2 - directory passed to the splitter as its output location
# Returns:   the splitter's exit status (its stderr is discarded).
#######################################
split_modules() {
  local cli_path="$1"
  local source_dir="$2"

  info " Splitting into modules..."
  node "${SCRIPT_DIR}/lib/module-splitter.mjs" "$cli_path" "$source_dir" 2>/dev/null
}

#######################################
# Build a binary RVF container via lib/rvf-builder.mjs.
# Arguments: $1 - source directory (split modules)
#            $2 - output .rvf path
#            $3 - exact package version
#            $4 - major.minor series
# Returns:   the builder's exit status (its stderr is discarded).
#######################################
build_rvf() {
  local source_dir="$1"
  local rvf_path="$2"
  local version="$3"
  local series="$4"

  info " Building binary RVF container..."
  node "${SCRIPT_DIR}/lib/rvf-builder.mjs" \
    "$source_dir" "$rvf_path" \
    --meta "version=${version}" \
    --meta "series=${series}" \
    --meta "package=@anthropic-ai/claude-code" \
    --meta "corpus=claude-code-rvsource" \
    2>/dev/null
}

#######################################
# Generate README.md for one version directory from source/metrics.json and
# the RVF manifest, falling back to placeholders when either is missing or
# unreadable.
# Arguments: $1 - version directory
#            $2 - major.minor series
#            $3 - exact package version
#            $4 - path to the .rvf file (manifest is "<path>.manifest.json")
#######################################
generate_readme() {
  local ver_dir="$1"
  local series="$2"
  local version="$3"
  local rvf_file="$4"

  local metrics_file="${ver_dir}/source/metrics.json"
  local manifest_file="${rvf_file}.manifest.json"
  local fields

  # Read metrics
  local bundle_size="unknown"
  local classes="?"
  local functions="?"
  local modules_count="?"

  if [[ -f "$metrics_file" ]]; then
    # The path travels via argv, not inside the JS text, so quotes in the
    # path cannot break the script; one node call yields all four fields.
    if fields=$(node -e '
      const m = JSON.parse(require("fs").readFileSync(process.argv[1], "utf-8"));
      console.log([
        (m.sizeBytes / 1024 / 1024).toFixed(1) + "MB",
        m.classes,
        m.functions,
        Object.keys(m.modules || {}).length,
      ].map(String).join("\n"));
    ' "$metrics_file" 2>/dev/null); then
      {
        IFS= read -r bundle_size
        IFS= read -r classes
        IFS= read -r functions
        IFS= read -r modules_count
      } <<< "$fields"
    else
      warn " Could not read ${metrics_file}"
    fi
  fi

  local rvf_size="N/A"
  local rvf_vectors="N/A"
  local rvf_id="N/A"

  if [[ -f "$manifest_file" ]]; then
    if fields=$(node -e '
      const m = JSON.parse(require("fs").readFileSync(process.argv[1], "utf-8"));
      console.log([
        (m.fileSizeBytes / 1024).toFixed(1) + "KB",
        m.totalVectors,
        m.fileId,
      ].map(String).join("\n"));
    ' "$manifest_file" 2>/dev/null); then
      {
        IFS= read -r rvf_size
        IFS= read -r rvf_vectors
        IFS= read -r rvf_id
      } <<< "$fields"
    else
      warn " Could not read ${manifest_file}"
    fi
  fi

  # NOTE(review): the original heredoc body was lost from the reviewed copy of
  # this file; the text below is a minimal reconstruction that reports exactly
  # the values computed above. Restore the original wording if it is available.
  cat > "${ver_dir}/README.md" <<EOF
# Claude Code v${series}.x

| Field | Value |
|-------|-------|
| Package | @anthropic-ai/claude-code |
| Version | ${version} |
| Bundle size | ${bundle_size} |
| Classes | ${classes} |
| Functions | ${functions} |
| Modules | ${modules_count} |
| RVF container | $(basename "$rvf_file") |
| RVF size | ${rvf_size} |
| Vectors | ${rvf_vectors} |
| File ID | ${rvf_id} |
EOF
}

#######################################
# Generate the corpus-level index README.
# NOTE(review): this function's header line was lost from the reviewed copy;
# the signature is reconstructed from the call site in main() and from the
# body's use of base_dir and entries.
# Arguments: $1  - corpus base directory (README.md is written here)
#            $2… - pre-formatted Markdown table rows, one per series
#######################################
generate_index() {
  local base_dir="$1"
  shift

  cat > "${base_dir}/README.md" <<'INDEXHEADER'
# Claude Code RVF Corpus

Binary RVF containers for every major Claude Code CLI release, with
HNSW-indexed vector embeddings and witness chains for provenance.

## Versions

| Series | Version | Bundle | RVF Size | Vectors | File ID |
|--------|---------|--------|----------|---------|---------|
INDEXHEADER

  # Iterating "$@" is safe with zero rows; expanding an empty array copy
  # trips `set -u` on bash older than 4.4.
  local entry
  for entry in "$@"; do
    echo "$entry" >> "${base_dir}/README.md"
  done

  cat >> "${base_dir}/README.md" <<'INDEXFOOTER'

## How to Use

```bash
# Build the corpus
./scripts/claude-code-rvf-corpus.sh

# Build only specific series
./scripts/claude-code-rvf-corpus.sh --series 2.0,2.1
```

## Format

Each version directory contains:
- A binary `.rvf` container (128-dim cosine-distance HNSW index)
- A `.manifest.json` sidecar with vector-to-fragment mapping
- Extracted JavaScript modules in `source/`

Generated by `scripts/claude-code-rvf-corpus.sh` using `@ruvector/rvf-node`.
INDEXFOOTER
}

# -----------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------

#######################################
# Resolve the series to build, then download, split, package and document
# each one, and finally regenerate the corpus index.
# Globals:   OUTPUT_BASE, FILTER_SERIES, DRY_RUN (read); TMP_DIR (written)
#######################################
main() {
  echo -e "${BOLD}Claude Code RVF Corpus Builder${NC}"
  echo -e "${BOLD}==============================${NC}"
  echo ""

  TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/cc-rvf-corpus-XXXXXX")
  mkdir -p "$OUTPUT_BASE"

  # Get version groups
  local groups
  if ! groups=$(get_version_groups) || [[ -z "$groups" ]]; then
    err "No versions found on npm"
    exit 1
  fi

  local total_groups
  total_groups=$(grep -c '^' <<< "$groups" || true)
  log "Found ${total_groups} major.minor series"

  local line series version

  # Apply filter if specified
  if [[ -n "$FILTER_SERIES" ]]; then
    local filtered="" wanted
    local -a filter_array=()
    IFS=',' read -ra filter_array <<< "$FILTER_SERIES"

    while IFS= read -r line; do
      series=${line%% *}
      for wanted in ${filter_array[@]+"${filter_array[@]}"}; do
        if [[ "$series" == "$wanted" ]]; then
          filtered+="${line}"$'\n'
          break   # a series listed twice must not be built twice
        fi
      done
    done <<< "$groups"

    groups="$filtered"
    # grep -c prints 0 *and* exits 1 on no match; "|| echo 0" would append a
    # second 0 to the captured value.
    total_groups=$(printf '%s' "$groups" | grep -c '^' || true)
    log "Filtered to ${total_groups} series: ${FILTER_SERIES}"
  fi

  if [[ "$DRY_RUN" == true ]]; then
    warn "DRY RUN - would process these versions:"
    while IFS= read -r line; do
      [[ -n "$line" ]] || continue
      read -r series version <<< "$line"
      echo " v${series}.x -> ${version}"
    done <<< "$groups"
    exit 0
  fi

  local processed=0
  local failed=0
  local ver_dir source_dir rvf_file extract_dir cli_path

  while IFS= read -r line; do
    [[ -n "$line" ]] || continue
    read -r series version <<< "$line"

    echo ""
    log "Processing v${series}.x (latest: ${version})"

    ver_dir="${OUTPUT_BASE}/v${series}.x"
    source_dir="${ver_dir}/source"
    rvf_file="${ver_dir}/claude-code-v${series}.rvf"
    extract_dir="${TMP_DIR}/extract-${version}"

    mkdir -p "$ver_dir" "$source_dir"

    # Step 1: Download
    if ! download_version "$version" "$extract_dir"; then
      warn " Skipping ${version} (download failed)"
      failed=$((failed + 1))
      continue
    fi

    cli_path="${extract_dir}/cli.js"
    if [[ ! -f "$cli_path" ]]; then
      warn " No CLI bundle found for ${version}"
      failed=$((failed + 1))
      continue
    fi

    # Step 2: Split into modules (failure is reported but not fatal)
    if ! split_modules "$cli_path" "$source_dir"; then
      warn " Module splitting failed for ${version}"
    fi

    # Step 3: Build binary RVF container
    if build_rvf "$source_dir" "$rvf_file" "$version" "$series"; then
      log " RVF container created: $(basename "$rvf_file")"
    else
      warn " RVF creation failed for ${version}"
      # Create a fallback TODO note
      # NOTE(review): the original note text was lost from the reviewed copy;
      # this is a minimal reconstruction.
      cat > "${ver_dir}/TODO-rvf.md" <<EOF
# TODO: RVF container for v${series}.x

RVF creation failed for version ${version}.
Re-run: ./scripts/claude-code-rvf-corpus.sh --series ${series}
EOF
    fi

    # NOTE(review): everything from here to the index-row fallback strings
    # below was lost from the reviewed copy and is reconstructed from the
    # surviving code (generate_readme's signature, the "processed" counter in
    # the summary, and the literal fallback rows). Confirm against the
    # original.

    # Step 4: Write the per-version README
    generate_readme "$ver_dir" "$series" "$version" "$rvf_file"

    processed=$((processed + 1))
  done <<< "$groups"

  # Build one index row per version directory present on disk.
  local -a final_entries=()
  local dir series_name series_key manifest known_version row s v
  for dir in "$OUTPUT_BASE"/v*/; do
    [[ -d "$dir" ]] || continue
    dir=${dir%/}
    series_name=${dir##*/}
    series_key=${series_name#v}
    series_key=${series_key%.x}
    manifest="${dir}/claude-code-v${series_key}.rvf.manifest.json"

    if [[ -f "$manifest" ]]; then
      # The exact version is only known for series resolved in this run.
      # String comparison on purpose: numerically 2.1 and 2.10 are equal.
      known_version="?"
      while read -r s v; do
        if [[ "$s" == "$series_key" ]]; then
          known_version="$v"
          break
        fi
      done <<< "$groups"

      row=$(node -e '
        const fs = require("fs");
        const [series, version, manifestPath, metricsPath] = process.argv.slice(1);
        const m = JSON.parse(fs.readFileSync(manifestPath, "utf-8"));
        let bundle = "?";
        if (fs.existsSync(metricsPath)) {
          const mt = JSON.parse(fs.readFileSync(metricsPath, "utf-8"));
          bundle = (mt.sizeBytes / 1024 / 1024).toFixed(1) + "MB";
        }
        console.log("| " + [
          series,
          version,
          bundle,
          (m.fileSizeBytes / 1024).toFixed(1) + "KB",
          m.totalVectors,
          m.fileId,
        ].join(" | ") + " |");
      ' "$series_name" "$known_version" "$manifest" "${dir}/source/metrics.json" \
        2>/dev/null || echo "| ${series_name} | ? | ? | ? | ? | ? |")
      final_entries+=("$row")
    else
      final_entries+=("| ${series_name} | ? | ? | N/A | N/A | N/A |")
    fi
  done

  # The +"…" form keeps an empty array from tripping `set -u` on bash < 4.4.
  generate_index "$OUTPUT_BASE" ${final_entries[@]+"${final_entries[@]}"}

  echo ""
  echo -e "${BOLD}Corpus build complete.${NC}"
  log "Output: ${OUTPUT_BASE}/"
  log "Versions processed: ${processed:-0}"
  if [[ ${failed:-0} -gt 0 ]]; then
    warn "Versions failed: ${failed}"
  fi
}

main "$@"