feat(rvagent-learning): add multi-domain research discovery (PR #263 approach)

Incorporates research approaches from mcp-brain-server PR #263:
- Add DiscoveryDomain enum for multi-domain discovery
- Add ResearchClient for PubMed and arXiv integration
- Add DiscoveryConfig for configurable discovery parameters
- Update daily_cycle to fetch from external research sources
- Fetch from PubMed via NCBI E-utilities API
- Fetch from arXiv via Atom feed API

Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
Reuven 2026-03-16 22:53:57 -04:00
parent f76d1da3d0
commit 10eecdb4c9
5 changed files with 435 additions and 11 deletions

View file

@ -33,6 +33,9 @@ uuid = { version = "1.8", features = ["v4", "serde"] }
# HTTP client for API calls - pin to avoid edition2024 deps
reqwest = { version = "0.11", features = ["json", "rustls-tls"], default-features = false }
# URL encoding for API requests
urlencoding = "2.1"
# File watching for continuous mode
notify = { version = "6.1", optional = true }

View file

@ -33,6 +33,9 @@ uuid = { workspace = true, features = ["v4", "serde"] }
# HTTP client for API calls
reqwest = { version = "0.12", features = ["json", "rustls-tls"], default-features = false }
# URL encoding for API requests
urlencoding = "2.1"
# Google Cloud integration (via gcloud CLI)
# File watching for continuous mode

View file

@ -13,7 +13,7 @@
use rvagent_learning::{
DailyLearningLoop, SchedulerConfig,
discovery::{CodebaseScanner, PatternAnalyzer, DiscoveryLog},
discovery::{CodebaseScanner, PatternAnalyzer, DiscoveryLog, DiscoveryConfig, ResearchClient},
goap::{GoapPlanner, LearningGoal, LearningWorldState},
integration::PiRuvIoClient,
};
@ -97,22 +97,52 @@ async fn run_single_cycle(scan_dir: &str, dry_run: bool) -> anyhow::Result<()> {
println!();
let start = Instant::now();
let mut discoveries: Vec<DiscoveryLog> = Vec::new();
// Phase 1: Scan codebase
println!("📂 Phase 1: Scanning codebase...");
// Phase 1: Fetch from external research sources (PR #263 approach)
println!("🌐 Phase 1: Fetching external research sources...");
let mut research_config = DiscoveryConfig::default();
research_config.pubmed_queries = vec![
"transformer neural network".to_string(),
"large language model".to_string(),
"reinforcement learning".to_string(),
];
research_config.arxiv_categories = vec![
"cs.AI".to_string(),
"cs.LG".to_string(),
];
let research_client = ResearchClient::new(research_config);
let external_results = research_client.fetch_all_domains().await;
let mut external_count = 0;
for (domain, external_discoveries) in external_results {
println!(" {}: {} discoveries", domain, external_discoveries.len());
for ext in external_discoveries {
discoveries.push(ext.to_discovery_log());
external_count += 1;
}
}
println!(" Total external discoveries: {}", external_count);
// Phase 2: Scan codebase
println!();
println!("📂 Phase 2: Scanning codebase...");
let scanner = CodebaseScanner::new(scan_dir);
let files = scanner.scan().await?;
println!(" Found {} files to analyze", files.len());
// Phase 2: Analyze patterns
println!("🔍 Phase 2: Analyzing patterns...");
// Phase 3: Analyze patterns from codebase
println!("🔍 Phase 3: Analyzing patterns...");
let analyzer = PatternAnalyzer::new();
let file_contents: Vec<(String, String)> = files
.into_iter()
.map(|f| (f.path.to_string_lossy().to_string(), f.content))
.collect();
let discoveries = analyzer.analyze_files(&file_contents);
println!(" Discovered {} patterns", discoveries.len());
let codebase_discoveries = analyzer.analyze_files(&file_contents);
println!(" Discovered {} patterns from codebase", codebase_discoveries.len());
discoveries.extend(codebase_discoveries);
println!(" Total discoveries: {}", discoveries.len());
// Show discoveries
if !discoveries.is_empty() {
@ -129,9 +159,9 @@ async fn run_single_cycle(scan_dir: &str, dry_run: bool) -> anyhow::Result<()> {
}
}
// Phase 3: GOAP Planning
// Phase 4: GOAP Planning
println!();
println!("🧠 Phase 3: GOAP Planning...");
println!("🧠 Phase 4: GOAP Planning...");
let planner = GoapPlanner::new();
let mut state = LearningWorldState::default();
state.patterns_discovered = discoveries.len();
@ -144,7 +174,7 @@ async fn run_single_cycle(scan_dir: &str, dry_run: bool) -> anyhow::Result<()> {
println!(" - {} (cost: {:.1})", action.action, action.cost);
}
// Phase 4: Submit to π.ruv.io (if not dry run)
// Phase 5: Submit to π.ruv.io (if not dry run)
if !dry_run && !discoveries.is_empty() {
println!();
println!("☁️ Phase 4: Submitting to π.ruv.io...");
@ -179,7 +209,7 @@ async fn run_single_cycle(scan_dir: &str, dry_run: bool) -> anyhow::Result<()> {
}
} else if dry_run {
println!();
println!("☁️ Phase 4: Skipped (dry run mode)");
println!("☁️ Phase 5: Skipped (dry run mode)");
}
let duration = start.elapsed();

View file

@ -0,0 +1,385 @@
//! External research source integrations
//!
//! Incorporates multi-domain discovery approaches from mcp-brain-server (PR #263).
//! Fetches real-world data from open scientific APIs for the daily learning loop.
use super::{DiscoveryCategory, DiscoveryLog, QualityAssessment, ToolUsage, ToolType};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use thiserror::Error;
/// Discovery domains (aligned with mcp-brain-server trainer.rs)
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
#[serde(rename_all = "snake_case")]
pub enum DiscoveryDomain {
/// Space science - NASA, ESA, arXiv astro-ph
SpaceScience,
/// Earth science - climate, geology, oceanography
EarthScience,
/// Academic research - arXiv, SSRN, preprints
AcademicResearch,
/// Economics and finance - FRED, BLS, market data
EconomicsFinance,
/// Medical and genomics - PubMed, NCBI
MedicalGenomics,
/// Materials and physics - arXiv cond-mat, materials databases
MaterialsPhysics,
/// Local codebase scanning
Codebase,
}
impl std::fmt::Display for DiscoveryDomain {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::SpaceScience => write!(f, "space-science"),
Self::EarthScience => write!(f, "earth-science"),
Self::AcademicResearch => write!(f, "academic-research"),
Self::EconomicsFinance => write!(f, "economics-finance"),
Self::MedicalGenomics => write!(f, "medical-genomics"),
Self::MaterialsPhysics => write!(f, "materials-physics"),
Self::Codebase => write!(f, "codebase"),
}
}
}
/// Configuration for multi-domain discovery (from PR #263 TrainerConfig)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiscoveryConfig {
/// Minimum confidence to submit a discovery
pub min_confidence: f64,
/// Maximum discoveries per cycle
pub max_per_cycle: usize,
/// Duplicate detection threshold (cosine similarity)
pub duplicate_threshold: f64,
/// Active discovery domains
pub active_domains: Vec<DiscoveryDomain>,
/// Whether to trigger SONA learning after ingestion
pub trigger_sona: bool,
/// API request delay (ms) for rate limiting
pub api_delay_ms: u64,
/// PubMed search queries
pub pubmed_queries: Vec<String>,
/// arXiv categories to search
pub arxiv_categories: Vec<String>,
}
impl Default for DiscoveryConfig {
fn default() -> Self {
Self {
min_confidence: 0.70,
max_per_cycle: 50,
duplicate_threshold: 0.95,
active_domains: vec![
DiscoveryDomain::MedicalGenomics,
DiscoveryDomain::SpaceScience,
DiscoveryDomain::AcademicResearch,
DiscoveryDomain::Codebase,
],
trigger_sona: true,
api_delay_ms: 500,
pubmed_queries: vec![
"machine learning".to_string(),
"neural network optimization".to_string(),
"transformer architecture".to_string(),
],
arxiv_categories: vec![
"cs.AI".to_string(),
"cs.LG".to_string(),
"cs.CL".to_string(),
],
}
}
}
/// Errors from external source fetching
#[derive(Debug, Error)]
pub enum ExternalSourceError {
#[error("HTTP request failed: {0}")]
HttpError(#[from] reqwest::Error),
#[error("JSON parse failed: {0}")]
JsonError(#[from] serde_json::Error),
#[error("XML parse failed: {0}")]
XmlError(String),
#[error("Rate limited, retry after {0}ms")]
RateLimited(u64),
#[error("API returned error: {0}")]
ApiError(String),
}
/// A discovery from an external source
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExternalDiscovery {
pub id: String,
pub domain: DiscoveryDomain,
pub title: String,
pub content: String,
pub source_url: String,
pub authors: Vec<String>,
pub tags: Vec<String>,
pub confidence: f64,
pub timestamp: DateTime<Utc>,
}
impl ExternalDiscovery {
/// Convert to DiscoveryLog for unified processing
pub fn to_discovery_log(&self) -> DiscoveryLog {
let category = match self.domain {
DiscoveryDomain::SpaceScience => DiscoveryCategory::Other,
DiscoveryDomain::MedicalGenomics => DiscoveryCategory::Other,
DiscoveryDomain::AcademicResearch => DiscoveryCategory::Other,
DiscoveryDomain::EconomicsFinance => DiscoveryCategory::Other,
DiscoveryDomain::EarthScience => DiscoveryCategory::Other,
DiscoveryDomain::MaterialsPhysics => DiscoveryCategory::Other,
DiscoveryDomain::Codebase => DiscoveryCategory::Architecture,
};
let mut log = DiscoveryLog::new(category, &self.title, &self.content)
.with_tags(self.tags.clone())
.with_tool(ToolUsage {
tool_type: ToolType::Custom,
tool_name: format!("{} API", self.domain),
usage_description: format!("Fetched from {}", self.source_url),
duration_ms: None,
input_summary: None,
output_summary: Some(format!("{} authors", self.authors.len())),
});
log.quality = QualityAssessment {
novelty: self.confidence,
usefulness: 0.7,
clarity: 0.8,
correctness: self.confidence,
generalizability: 0.6,
composite: self.confidence * 0.8,
confidence: self.confidence,
method: super::AssessmentMethod::Heuristic,
};
log
}
}
/// Multi-domain research client
pub struct ResearchClient {
http: reqwest::Client,
config: DiscoveryConfig,
}
impl ResearchClient {
pub fn new(config: DiscoveryConfig) -> Self {
let http = reqwest::Client::builder()
.user_agent("rvagent-learning/1.0 (https://pi.ruv.io; benevolent-discovery)")
.timeout(std::time::Duration::from_secs(30))
.build()
.expect("HTTP client");
Self { http, config }
}
/// Fetch discoveries from PubMed (NCBI E-utilities)
pub async fn fetch_pubmed(&self, query: &str, max_results: usize) -> Result<Vec<ExternalDiscovery>, ExternalSourceError> {
// Rate limit
tokio::time::sleep(std::time::Duration::from_millis(self.config.api_delay_ms)).await;
// Search for PMIDs
let search_url = format!(
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={}&retmax={}&retmode=json&sort=date",
urlencoding::encode(query),
max_results
);
let resp: serde_json::Value = self.http.get(&search_url).send().await?.json().await?;
let pmids: Vec<String> = resp["esearchresult"]["idlist"]
.as_array()
.map(|arr| arr.iter().filter_map(|v| v.as_str().map(String::from)).collect())
.unwrap_or_default();
if pmids.is_empty() {
return Ok(vec![]);
}
// Rate limit before fetch
tokio::time::sleep(std::time::Duration::from_millis(self.config.api_delay_ms)).await;
// Fetch article summaries
let fetch_url = format!(
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={}&retmode=json",
pmids.join(",")
);
let summary: serde_json::Value = self.http.get(&fetch_url).send().await?.json().await?;
let mut discoveries = Vec::new();
if let Some(result) = summary.get("result") {
for pmid in &pmids {
if let Some(article) = result.get(pmid) {
let title = article["title"].as_str().unwrap_or("Untitled").to_string();
let source = article["source"].as_str().unwrap_or("").to_string();
// Get authors
let authors: Vec<String> = article["authors"]
.as_array()
.map(|arr| {
arr.iter()
.filter_map(|a| a["name"].as_str().map(String::from))
.collect()
})
.unwrap_or_default();
discoveries.push(ExternalDiscovery {
id: pmid.clone(),
domain: DiscoveryDomain::MedicalGenomics,
title: title.clone(),
content: format!("Published in {}. {}", source, title),
source_url: format!("https://pubmed.ncbi.nlm.nih.gov/{}/", pmid),
authors,
tags: vec!["pubmed".to_string(), "medical".to_string()],
confidence: 0.85,
timestamp: Utc::now(),
});
}
}
}
Ok(discoveries)
}
/// Fetch discoveries from arXiv
pub async fn fetch_arxiv(&self, category: &str, max_results: usize) -> Result<Vec<ExternalDiscovery>, ExternalSourceError> {
// Rate limit
tokio::time::sleep(std::time::Duration::from_millis(self.config.api_delay_ms)).await;
let url = format!(
"https://export.arxiv.org/api/query?search_query=cat:{}&start=0&max_results={}&sortBy=lastUpdatedDate&sortOrder=descending",
category,
max_results
);
let resp = self.http.get(&url).send().await?.text().await?;
// Simple XML parsing for arXiv Atom feed
let mut discoveries = Vec::new();
for entry in resp.split("<entry>").skip(1) {
let title = extract_xml_tag(entry, "title").unwrap_or_default().trim().to_string();
let summary = extract_xml_tag(entry, "summary").unwrap_or_default().trim().to_string();
let id = extract_xml_tag(entry, "id").unwrap_or_default();
// Extract authors
let authors: Vec<String> = entry
.split("<author>")
.skip(1)
.filter_map(|a| extract_xml_tag(a, "name"))
.collect();
if !title.is_empty() {
discoveries.push(ExternalDiscovery {
id: id.clone(),
domain: DiscoveryDomain::AcademicResearch,
title,
content: summary,
source_url: id,
authors,
tags: vec!["arxiv".to_string(), category.to_string()],
confidence: 0.80,
timestamp: Utc::now(),
});
}
}
Ok(discoveries)
}
/// Fetch all discoveries from configured domains
pub async fn fetch_all_domains(&self) -> HashMap<DiscoveryDomain, Vec<ExternalDiscovery>> {
let mut results = HashMap::new();
for domain in &self.config.active_domains {
let discoveries = match domain {
DiscoveryDomain::MedicalGenomics => {
let mut all = Vec::new();
for query in &self.config.pubmed_queries {
match self.fetch_pubmed(query, 10).await {
Ok(d) => all.extend(d),
Err(e) => tracing::warn!("PubMed fetch failed for '{}': {}", query, e),
}
}
all
}
DiscoveryDomain::AcademicResearch => {
let mut all = Vec::new();
for cat in &self.config.arxiv_categories {
match self.fetch_arxiv(cat, 10).await {
Ok(d) => all.extend(d),
Err(e) => tracing::warn!("arXiv fetch failed for '{}': {}", cat, e),
}
}
all
}
_ => Vec::new(), // Other domains not yet implemented
};
results.insert(domain.clone(), discoveries);
}
results
}
}
/// Extract content between XML tags (simple helper)
fn extract_xml_tag(xml: &str, tag: &str) -> Option<String> {
let start_tag = format!("<{}", tag);
let end_tag = format!("</{}>", tag);
let start = xml.find(&start_tag)?;
let after_start = &xml[start..];
let content_start = after_start.find('>')? + 1;
let content = &after_start[content_start..];
let end = content.find(&end_tag)?;
Some(content[..end].to_string())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_discovery_domain_display() {
assert_eq!(DiscoveryDomain::MedicalGenomics.to_string(), "medical-genomics");
assert_eq!(DiscoveryDomain::SpaceScience.to_string(), "space-science");
}
#[test]
fn test_default_config() {
let config = DiscoveryConfig::default();
assert_eq!(config.min_confidence, 0.70);
assert!(config.active_domains.contains(&DiscoveryDomain::MedicalGenomics));
}
#[test]
fn test_external_to_discovery_log() {
let ext = ExternalDiscovery {
id: "123".to_string(),
domain: DiscoveryDomain::MedicalGenomics,
title: "Test Paper".to_string(),
content: "Abstract here".to_string(),
source_url: "https://pubmed.ncbi.nlm.nih.gov/123/".to_string(),
authors: vec!["Smith J".to_string()],
tags: vec!["test".to_string()],
confidence: 0.9,
timestamp: Utc::now(),
};
let log = ext.to_discovery_log();
assert_eq!(log.title, "Test Paper");
assert!(log.quality.composite > 0.5);
}
#[test]
fn test_xml_extraction() {
let xml = "<entry><title>Test Title</title><summary>Test Summary</summary></entry>";
assert_eq!(extract_xml_tag(xml, "title"), Some("Test Title".to_string()));
assert_eq!(extract_xml_tag(xml, "summary"), Some("Test Summary".to_string()));
}
}

View file

@ -4,12 +4,15 @@
//! - DiscoveryLog struct with tool and method attribution
//! - Scanner for codebase pattern discovery
//! - Quality assessment integration
//! - Multi-domain external source discovery (PR #263 approaches)
mod scanner;
mod analyzer;
mod external;
pub use scanner::CodebaseScanner;
pub use analyzer::PatternAnalyzer;
pub use external::{DiscoveryConfig, DiscoveryDomain, ExternalDiscovery, ExternalSourceError, ResearchClient};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};