ruvector/scripts/benchmark_ruvector.sql
rUv 814f595995 feat(studio): Add complete RuVector Studio application
Major additions:
- Complete Next.js studio application with 1600+ components
- Docker support (Dockerfile.combined, docker-compose.yml)
- GCP deployment documentation and benchmarks
- SQL benchmark scripts for performance testing
- Sentry integration for monitoring
- Comprehensive test suite and mocks

Studio features:
- Dashboard and admin interfaces
- Data visualization components
- Authentication and user management
- API integration with RuVector backend
- Static data and public assets

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-06 23:04:48 +00:00

281 lines
9.1 KiB
SQL

-- RuVector Comprehensive Benchmark Script
-- Tests all major RuVector extension capabilities with real data
\timing on
\echo '=============================================='
\echo 'RuVector Benchmark Suite v1.0'
\echo '=============================================='
-- Verify RuVector extension
\echo '\n=== 1. VERIFY RUVECTOR EXTENSION ==='
SELECT ruvector_version() AS version;
SELECT COUNT(*) AS total_functions FROM pg_proc WHERE proname LIKE 'ruvector_%';
-- Create benchmark schema
\echo '\n=== 2. CREATE BENCHMARK SCHEMA ==='
DROP TABLE IF EXISTS benchmark_documents CASCADE;
DROP TABLE IF EXISTS benchmark_agents CASCADE;
DROP TABLE IF EXISTS benchmark_queries CASCADE;
-- Document embeddings table (1536-dim for OpenAI compatibility)
CREATE TABLE benchmark_documents (
id SERIAL PRIMARY KEY,
title TEXT NOT NULL,
category TEXT NOT NULL,
embedding FLOAT4[] NOT NULL,
created_at TIMESTAMP DEFAULT NOW()
);
-- Agent definitions for routing
CREATE TABLE benchmark_agents (
id SERIAL PRIMARY KEY,
name TEXT NOT NULL,
description TEXT NOT NULL,
embedding FLOAT4[] NOT NULL,
llm_provider TEXT NOT NULL
);
-- Query log for GNN learning
CREATE TABLE benchmark_queries (
id SERIAL PRIMARY KEY,
query_embedding FLOAT4[] NOT NULL,
result_ids INTEGER[],
user_feedback FLOAT4 DEFAULT 0.0,
query_time_ms FLOAT4,
created_at TIMESTAMP DEFAULT NOW()
);
\echo 'Tables created successfully'
-- Generate realistic embeddings using RuVector functions
\echo '\n=== 3. GENERATE TEST DATA (10,000 documents) ==='
-- Insert 10,000 documents with normalized random embeddings
-- Using categories: tech, science, business, health, entertainment
INSERT INTO benchmark_documents (title, category, embedding)
SELECT
'Document ' || i AS title,
CASE (i % 5)
WHEN 0 THEN 'tech'
WHEN 1 THEN 'science'
WHEN 2 THEN 'business'
WHEN 3 THEN 'health'
ELSE 'entertainment'
END AS category,
-- Generate 1536-dim normalized random vectors
(SELECT array_agg(
-- Normalize: divide by sqrt(1536) ≈ 39.2
(random() - 0.5) / 19.6
) FROM generate_series(1, 1536)) AS embedding
FROM generate_series(1, 10000) AS i;
SELECT COUNT(*) AS document_count FROM benchmark_documents;
-- Insert agents for routing benchmark
\echo '\n=== 4. CREATE AGENT DEFINITIONS ==='
INSERT INTO benchmark_agents (name, description, llm_provider, embedding)
VALUES
('CodeAssistant', 'Helps with programming and code review', 'Claude 4',
(SELECT array_agg((random() - 0.5) / 19.6) FROM generate_series(1, 1536))),
('ResearchAgent', 'Analyzes scientific papers and data', 'Gemini 2.0',
(SELECT array_agg((random() - 0.5) / 19.6) FROM generate_series(1, 1536))),
('BusinessAnalyst', 'Financial analysis and market research', 'Claude 4',
(SELECT array_agg((random() - 0.5) / 19.6) FROM generate_series(1, 1536))),
('HealthAdvisor', 'Medical information and wellness tips', 'Llama 4',
(SELECT array_agg((random() - 0.5) / 19.6) FROM generate_series(1, 1536))),
('CreativeWriter', 'Content creation and storytelling', 'Gemini 2.0',
(SELECT array_agg((random() - 0.5) / 19.6) FROM generate_series(1, 1536)));
SELECT name, llm_provider FROM benchmark_agents;
-- Benchmark 1: Basic vector operations
\echo '\n=== 5. BENCHMARK: BASIC VECTOR OPERATIONS ==='
-- Test ruvector_dims
SELECT ruvector_dims(embedding::ruvector) AS dims
FROM benchmark_documents LIMIT 1;
-- Test cosine distance calculation (100 pairs)
\echo 'Cosine distance (100 pairs):'
SELECT AVG(ruvector_cosine_distance(
d1.embedding::ruvector,
d2.embedding::ruvector
)) AS avg_distance
FROM benchmark_documents d1
CROSS JOIN benchmark_documents d2
WHERE d1.id <= 10 AND d2.id BETWEEN 11 AND 20;
-- Benchmark 2: HNSW Index Creation
\echo '\n=== 6. BENCHMARK: HNSW INDEX CREATION ==='
-- Create HNSW index with custom parameters
\echo 'Creating HNSW index (M=16, ef_construction=200)...'
CREATE INDEX idx_documents_hnsw ON benchmark_documents
USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 200);
-- Check index size
SELECT pg_size_pretty(pg_relation_size('idx_documents_hnsw')) AS index_size;
-- Benchmark 3: Vector Search Performance
\echo '\n=== 7. BENCHMARK: VECTOR SEARCH (k-NN) ==='
-- Generate query vector
\echo 'Query 1: Top 10 nearest neighbors'
EXPLAIN ANALYZE
SELECT id, title, category,
embedding <-> (SELECT embedding FROM benchmark_documents WHERE id = 1)::ruvector AS distance
FROM benchmark_documents
ORDER BY distance
LIMIT 10;
-- Benchmark 4: Category-filtered search
\echo '\nQuery 2: Top 10 in category "tech"'
EXPLAIN ANALYZE
SELECT id, title,
embedding <-> (SELECT embedding FROM benchmark_documents WHERE id = 100)::ruvector AS distance
FROM benchmark_documents
WHERE category = 'tech'
ORDER BY distance
LIMIT 10;
-- Benchmark 5: Batch similarity search
\echo '\n=== 8. BENCHMARK: BATCH SIMILARITY SEARCH (100 queries) ==='
-- Run 100 similarity searches
SELECT COUNT(*) AS queries_executed,
AVG(distance) AS avg_distance,
MIN(distance) AS min_distance,
MAX(distance) AS max_distance
FROM (
SELECT query_id,
(SELECT embedding <-> d.embedding::ruvector
FROM benchmark_documents d
ORDER BY embedding <-> d.embedding
LIMIT 1) AS distance
FROM (
SELECT id AS query_id, embedding
FROM benchmark_documents
WHERE id <= 100
) queries
) results;
-- Benchmark 6: GNN Graph Operations
\echo '\n=== 9. BENCHMARK: GNN GRAPH OPERATIONS ==='
-- Create similarity graph
\echo 'Creating document similarity graph...'
SELECT ruvector_create_graph('doc_graph');
-- Add nodes (first 100 documents)
\echo 'Adding 100 nodes...'
SELECT COUNT(*) AS nodes_added FROM (
SELECT ruvector_add_node('doc_graph', id::text, embedding::ruvector)
FROM benchmark_documents
WHERE id <= 100
) t;
-- Add edges based on similarity
\echo 'Adding edges (similarity > 0.8)...'
SELECT COUNT(*) AS edges_added FROM (
SELECT ruvector_add_edge(
'doc_graph',
d1.id::text,
d2.id::text,
1.0 - ruvector_cosine_distance(d1.embedding::ruvector, d2.embedding::ruvector)
)
FROM benchmark_documents d1
CROSS JOIN benchmark_documents d2
WHERE d1.id < d2.id
AND d1.id <= 20 AND d2.id <= 20
AND ruvector_cosine_distance(d1.embedding::ruvector, d2.embedding::ruvector) < 0.2
) t;
-- Benchmark 7: Agent Routing
\echo '\n=== 10. BENCHMARK: AGENT ROUTING (ruvLLM) ==='
-- Route query to best agent
\echo 'Routing 10 queries to agents...'
SELECT
q.id AS query_doc,
a.name AS best_agent,
a.llm_provider,
ruvector_cosine_distance(q.embedding::ruvector, a.embedding::ruvector) AS distance
FROM benchmark_documents q
CROSS JOIN LATERAL (
SELECT name, llm_provider, embedding
FROM benchmark_agents
ORDER BY q.embedding <-> embedding::ruvector
LIMIT 1
) a
WHERE q.id <= 10;
-- Register agents with ruvLLM
\echo 'Registering agents with ruvLLM...'
SELECT ruvector_register_agent('CodeAssistant',
(SELECT embedding FROM benchmark_agents WHERE name = 'CodeAssistant')::ruvector,
'{"llm": "Claude 4", "specialization": "code"}'::jsonb);
SELECT ruvector_register_agent('ResearchAgent',
(SELECT embedding FROM benchmark_agents WHERE name = 'ResearchAgent')::ruvector,
'{"llm": "Gemini 2.0", "specialization": "research"}'::jsonb);
-- Route a query
\echo 'Test agent routing:'
SELECT ruvector_route(
(SELECT embedding FROM benchmark_documents WHERE id = 1)::ruvector
) AS routed_agent;
-- Benchmark 8: GNN Learning
\echo '\n=== 11. BENCHMARK: GNN LEARNING ==='
-- Enable learning mode
SELECT ruvector_enable_learning(true);
-- Record positive feedback for search results
\echo 'Recording feedback for GNN training...'
SELECT ruvector_record_feedback(
(SELECT embedding FROM benchmark_documents WHERE id = i)::ruvector,
ARRAY[i+1, i+2, i+3]::integer[],
0.9 -- High relevance score
)
FROM generate_series(1, 50) AS i;
-- Check learning stats
\echo 'Learning statistics:'
SELECT ruvector_learning_stats();
-- Benchmark 9: Memory and Performance Stats
\echo '\n=== 12. BENCHMARK: SYSTEM STATS ==='
SELECT ruvector_memory_stats();
-- Benchmark 10: Pattern Extraction
\echo '\n=== 13. BENCHMARK: PATTERN EXTRACTION ==='
\echo 'Extracting patterns from embeddings...'
SELECT ruvector_extract_patterns(embedding::ruvector) AS patterns
FROM benchmark_documents
WHERE id = 1;
-- Benchmark 11: Sparse Vector Operations
\echo '\n=== 14. BENCHMARK: SPARSE VECTORS ==='
\echo 'Dense to sparse conversion:'
SELECT ruvector_dense_to_sparse(embedding::ruvector, 0.01) AS sparse_dims
FROM benchmark_documents
WHERE id = 1;
-- Benchmark 12: Auto-tuning
\echo '\n=== 15. BENCHMARK: AUTO-TUNING ==='
SELECT ruvector_auto_tune();
-- Final Summary
\echo '\n=============================================='
\echo 'BENCHMARK SUMMARY'
\echo '=============================================='
SELECT
(SELECT COUNT(*) FROM benchmark_documents) AS total_documents,
(SELECT COUNT(*) FROM benchmark_agents) AS total_agents,
(SELECT pg_size_pretty(pg_table_size('benchmark_documents'))) AS table_size,
(SELECT pg_size_pretty(pg_relation_size('idx_documents_hnsw'))) AS index_size;
\echo '\nBenchmark complete!'
\timing off