mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-23 21:25:02 +00:00
fix(security): patch command injection and SONA bugs, publish mincut-wasm
Security: - Fix #256: Add sanitizeShellArg() to MCP workers_create handler preventing shell command injection via name/preset/triggers params Bug fixes: - Fix #257: Add fallback parser in sona-wrapper.js for Rust debug format strings from SonaEngine.getStats() - Fix #258: Add force parameter to BackgroundLoop::run_cycle() so forceLearn() bypasses 100-trajectory minimum requirement Features: - Fix #254: Build and publish @ruvector/mincut-wasm@0.1.0 to npm - Add Wayback Machine fallback for Common Crawl CDX API Published: - @ruvector/mincut-wasm@0.1.0 - ruvector@0.2.13 Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
parent
ee15e5f178
commit
d35ea335b4
9 changed files with 371 additions and 181 deletions
323
crates/mcp-brain-server/Cargo.lock
generated
323
crates/mcp-brain-server/Cargo.lock
generated
|
|
@ -438,6 +438,16 @@ version = "0.9.6"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6"
|
||||
dependencies = [
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation-sys"
|
||||
version = "0.8.7"
|
||||
|
|
@ -751,6 +761,12 @@ dependencies = [
|
|||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
|
||||
|
||||
[[package]]
|
||||
name = "fiat-crypto"
|
||||
version = "0.2.9"
|
||||
|
|
@ -775,6 +791,21 @@ version = "0.1.5"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
|
||||
dependencies = [
|
||||
"foreign-types-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types-shared"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
|
||||
|
||||
[[package]]
|
||||
name = "form_urlencoded"
|
||||
version = "1.2.2"
|
||||
|
|
@ -847,11 +878,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
"libc",
|
||||
"r-efi",
|
||||
"wasip2",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -1012,20 +1041,19 @@ dependencies = [
|
|||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-rustls"
|
||||
version = "0.27.7"
|
||||
name = "hyper-tls"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
|
||||
checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
|
||||
dependencies = [
|
||||
"http",
|
||||
"bytes",
|
||||
"http-body-util",
|
||||
"hyper",
|
||||
"hyper-util",
|
||||
"rustls",
|
||||
"rustls-pki-types",
|
||||
"native-tls",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tokio-native-tls",
|
||||
"tower-service",
|
||||
"webpki-roots",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -1308,6 +1336,12 @@ dependencies = [
|
|||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
|
||||
|
||||
[[package]]
|
||||
name = "litemap"
|
||||
version = "0.8.1"
|
||||
|
|
@ -1329,12 +1363,6 @@ version = "0.4.29"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
|
||||
|
||||
[[package]]
|
||||
name = "lru-slab"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
|
||||
|
||||
[[package]]
|
||||
name = "mach2"
|
||||
version = "0.4.3"
|
||||
|
|
@ -1549,6 +1577,23 @@ dependencies = [
|
|||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "native-tls"
|
||||
version = "0.2.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"log",
|
||||
"openssl",
|
||||
"openssl-probe",
|
||||
"openssl-sys",
|
||||
"schannel",
|
||||
"security-framework",
|
||||
"security-framework-sys",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ndarray"
|
||||
version = "0.15.6"
|
||||
|
|
@ -1683,6 +1728,50 @@ version = "1.70.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.76"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cfg-if",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"openssl-macros",
|
||||
"openssl-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openssl-macros"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openssl-probe"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.112"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "option-ext"
|
||||
version = "0.2.0"
|
||||
|
|
@ -1765,6 +1854,12 @@ dependencies = [
|
|||
"spki",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pkg-config"
|
||||
version = "0.3.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
|
||||
|
||||
[[package]]
|
||||
name = "portable-atomic"
|
||||
version = "1.13.1"
|
||||
|
|
@ -1837,61 +1932,6 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quinn"
|
||||
version = "0.11.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"cfg_aliases",
|
||||
"pin-project-lite",
|
||||
"quinn-proto",
|
||||
"quinn-udp",
|
||||
"rustc-hash",
|
||||
"rustls",
|
||||
"socket2",
|
||||
"thiserror 2.0.18",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"web-time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quinn-proto"
|
||||
version = "0.11.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"getrandom 0.3.4",
|
||||
"lru-slab",
|
||||
"rand 0.9.2",
|
||||
"ring",
|
||||
"rustc-hash",
|
||||
"rustls",
|
||||
"rustls-pki-types",
|
||||
"slab",
|
||||
"thiserror 2.0.18",
|
||||
"tinyvec",
|
||||
"tracing",
|
||||
"web-time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quinn-udp"
|
||||
version = "0.5.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
|
||||
dependencies = [
|
||||
"cfg_aliases",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"socket2",
|
||||
"tracing",
|
||||
"windows-sys 0.60.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.44"
|
||||
|
|
@ -2091,21 +2131,20 @@ dependencies = [
|
|||
"http-body",
|
||||
"http-body-util",
|
||||
"hyper",
|
||||
"hyper-rustls",
|
||||
"hyper-tls",
|
||||
"hyper-util",
|
||||
"js-sys",
|
||||
"log",
|
||||
"native-tls",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"quinn",
|
||||
"rustls",
|
||||
"rustls-pki-types",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_urlencoded",
|
||||
"sync_wrapper",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tokio-native-tls",
|
||||
"tower",
|
||||
"tower-http",
|
||||
"tower-service",
|
||||
|
|
@ -2113,21 +2152,6 @@ dependencies = [
|
|||
"wasm-bindgen",
|
||||
"wasm-bindgen-futures",
|
||||
"web-sys",
|
||||
"webpki-roots",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.17.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"cfg-if",
|
||||
"getrandom 0.2.17",
|
||||
"libc",
|
||||
"untrusted",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -2170,12 +2194,6 @@ dependencies = [
|
|||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
|
||||
|
||||
[[package]]
|
||||
name = "rustc_version"
|
||||
version = "0.4.1"
|
||||
|
|
@ -2186,17 +2204,16 @@ dependencies = [
|
|||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls"
|
||||
version = "0.23.37"
|
||||
name = "rustix"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
|
||||
checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"ring",
|
||||
"rustls-pki-types",
|
||||
"rustls-webpki",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
"bitflags",
|
||||
"errno",
|
||||
"libc",
|
||||
"linux-raw-sys",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -2205,21 +2222,9 @@ version = "1.14.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd"
|
||||
dependencies = [
|
||||
"web-time",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
version = "0.103.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53"
|
||||
dependencies = [
|
||||
"ring",
|
||||
"rustls-pki-types",
|
||||
"untrusted",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustversion"
|
||||
version = "1.0.22"
|
||||
|
|
@ -2429,12 +2434,44 @@ dependencies = [
|
|||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "schannel"
|
||||
version = "0.1.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939"
|
||||
dependencies = [
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
||||
|
||||
[[package]]
|
||||
name = "security-framework"
|
||||
version = "3.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"core-foundation",
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
"security-framework-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "security-framework-sys"
|
||||
version = "2.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3"
|
||||
dependencies = [
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "1.0.27"
|
||||
|
|
@ -2753,6 +2790,19 @@ dependencies = [
|
|||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.27.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
|
||||
dependencies = [
|
||||
"fastrand",
|
||||
"getrandom 0.3.4",
|
||||
"once_cell",
|
||||
"rustix",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "temporal-attractor-studio"
|
||||
version = "0.1.0"
|
||||
|
|
@ -2908,12 +2958,12 @@ dependencies = [
|
|||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-rustls"
|
||||
version = "0.26.4"
|
||||
name = "tokio-native-tls"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61"
|
||||
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
|
||||
dependencies = [
|
||||
"rustls",
|
||||
"native-tls",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
|
|
@ -3062,12 +3112,6 @@ version = "0.2.6"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
|
||||
|
||||
[[package]]
|
||||
name = "untrusted"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
|
||||
|
||||
[[package]]
|
||||
name = "unty"
|
||||
version = "0.0.4"
|
||||
|
|
@ -3122,6 +3166,12 @@ version = "0.1.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
|
||||
|
||||
[[package]]
|
||||
name = "vcpkg"
|
||||
version = "0.2.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.5"
|
||||
|
|
@ -3291,25 +3341,6 @@ dependencies = [
|
|||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "web-time"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webpki-roots"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed"
|
||||
dependencies = [
|
||||
"rustls-pki-types",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wide"
|
||||
version = "0.7.33"
|
||||
|
|
|
|||
|
|
@ -703,22 +703,77 @@ impl CommonCrawlAdapter {
|
|||
/// Uses CDX cache (ADR-115) to avoid redundant API calls - 24h TTL.
|
||||
///
|
||||
/// NOTE: CDX API at index.commoncrawl.org has connectivity issues from Cloud Run.
|
||||
/// Falls back to sample records for demonstration purposes.
|
||||
/// Falls back to Internet Archive's Wayback CDX API which is accessible.
|
||||
pub async fn query_cdx(&self, query: &CdxQuery) -> Result<Vec<CdxRecord>, String> {
|
||||
let crawl = match &query.crawl_index {
|
||||
Some(c) => c.clone(),
|
||||
None => self.latest_crawl.read().await.clone(),
|
||||
};
|
||||
|
||||
// Try live CDX API first, fall back to samples if it fails
|
||||
// Try live Common Crawl CDX API first
|
||||
let live_result = self.query_cdx_live(&query, &crawl).await;
|
||||
if live_result.is_ok() {
|
||||
return live_result;
|
||||
}
|
||||
|
||||
// Fall back to sample records for demonstration
|
||||
tracing::warn!("CDX API unavailable, using sample records for demonstration");
|
||||
self.get_sample_cdx_records(&query.url_pattern, query.limit)
|
||||
// Fall back to Internet Archive's Wayback CDX (works from Cloud Run)
|
||||
tracing::warn!("Common Crawl CDX unavailable, falling back to Wayback CDX");
|
||||
self.query_wayback_cdx(&query.url_pattern, query.limit).await
|
||||
}
|
||||
|
||||
/// Query Internet Archive's Wayback CDX API (fallback when Common Crawl CDX is unreachable).
|
||||
/// Returns synthetic CdxRecords with filename set to "wayback:{timestamp}" for special handling.
|
||||
async fn query_wayback_cdx(&self, url_pattern: &str, limit: usize) -> Result<Vec<CdxRecord>, String> {
|
||||
// IA Wayback CDX API
|
||||
let url = format!(
|
||||
"https://web.archive.org/cdx/search/cdx?url={}&output=json&limit={}",
|
||||
urlencoding::encode(url_pattern),
|
||||
limit + 1 // +1 for header row
|
||||
);
|
||||
|
||||
let resp = self.http.get(&url)
|
||||
.header("Accept", "application/json")
|
||||
.send().await
|
||||
.map_err(|e| format!("Wayback CDX failed: {e}"))?;
|
||||
|
||||
if !resp.status().is_success() {
|
||||
return Err(format!("Wayback CDX returned status {}", resp.status()));
|
||||
}
|
||||
|
||||
let body = resp.text().await.map_err(|e| format!("Wayback body read failed: {e}"))?;
|
||||
|
||||
// Parse IA CDX JSON array format: [[headers...], [values...], ...]
|
||||
let rows: Vec<Vec<String>> = serde_json::from_str(&body)
|
||||
.map_err(|e| format!("Wayback CDX parse failed: {e}"))?;
|
||||
|
||||
// Skip header row, convert to CdxRecord
|
||||
let records: Vec<CdxRecord> = rows.iter().skip(1).take(limit).filter_map(|row| {
|
||||
if row.len() >= 7 {
|
||||
// IA CDX columns: urlkey, timestamp, original, mimetype, statuscode, digest, length
|
||||
Some(CdxRecord {
|
||||
url: row.get(2).cloned().unwrap_or_default(),
|
||||
timestamp: row.get(1).cloned().unwrap_or_default(),
|
||||
mime: row.get(3).cloned().unwrap_or_default(),
|
||||
status: row.get(4).cloned().unwrap_or_default(),
|
||||
filename: format!("wayback:{}", row.get(1).cloned().unwrap_or_default()), // Special marker
|
||||
offset: 0,
|
||||
length: row.get(6).and_then(|s| s.parse().ok()).unwrap_or(0),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}).collect();
|
||||
|
||||
if records.is_empty() {
|
||||
return Err("No Wayback results found".into());
|
||||
}
|
||||
|
||||
// Mark URLs as seen
|
||||
for r in &records {
|
||||
self.seen_urls.insert(r.url.clone(), ());
|
||||
}
|
||||
|
||||
Ok(records)
|
||||
}
|
||||
|
||||
/// Get sample CDX records for demonstration when live API is unavailable.
|
||||
|
|
@ -878,25 +933,57 @@ impl CommonCrawlAdapter {
|
|||
Ok(records)
|
||||
}
|
||||
|
||||
/// Fetch a single page from Common Crawl via WARC range-GET.
|
||||
/// Fetch a single page from Common Crawl via WARC range-GET or Wayback Machine.
|
||||
pub async fn fetch_page(&self, record: &CdxRecord) -> Result<CrawlPage, String> {
|
||||
if record.filename.is_empty() || record.length == 0 {
|
||||
return Err("Invalid CDX record: missing filename or length".into());
|
||||
if record.filename.is_empty() {
|
||||
return Err("Invalid CDX record: missing filename".into());
|
||||
}
|
||||
let warc_url = format!("{}/{}", self.data_base, record.filename);
|
||||
let range = format!("bytes={}-{}", record.offset, record.offset + record.length - 1);
|
||||
|
||||
self.stats.pages_fetched.fetch_add(1, Ordering::Relaxed);
|
||||
let resp = self.http.get(&warc_url)
|
||||
.header("Range", &range)
|
||||
.send().await.map_err(|e| format!("WARC fetch failed for {}: {e}", record.url))?;
|
||||
if !resp.status().is_success() && resp.status().as_u16() != 206 {
|
||||
return Err(format!("WARC returned status {}", resp.status()));
|
||||
}
|
||||
let warc_bytes = resp.bytes().await.map_err(|e| format!("WARC body read failed: {e}"))?;
|
||||
|
||||
// Extract text from WARC record
|
||||
let (title, content) = self.extract_from_warc(&warc_bytes)?;
|
||||
// Check if this is a Wayback Machine record (filename = "wayback:{timestamp}")
|
||||
let (title, content) = if record.filename.starts_with("wayback:") {
|
||||
// Fetch from Internet Archive Wayback Machine
|
||||
let timestamp = &record.filename[8..]; // Extract timestamp after "wayback:"
|
||||
// Use id_ modifier for raw content without Wayback toolbar
|
||||
let wayback_url = format!(
|
||||
"https://web.archive.org/web/{}id_/{}",
|
||||
timestamp, record.url
|
||||
);
|
||||
tracing::info!("Fetching from Wayback: {}", wayback_url);
|
||||
|
||||
let resp = self.http.get(&wayback_url)
|
||||
.send().await
|
||||
.map_err(|e| format!("Wayback fetch failed for {}: {e}", record.url))?;
|
||||
|
||||
if !resp.status().is_success() {
|
||||
return Err(format!("Wayback returned status {}", resp.status()));
|
||||
}
|
||||
|
||||
let html_bytes = resp.bytes().await
|
||||
.map_err(|e| format!("Wayback body read failed: {e}"))?;
|
||||
|
||||
// Extract directly from HTML (no WARC envelope)
|
||||
self.extract_from_html(&html_bytes)?
|
||||
} else {
|
||||
// Standard Common Crawl WARC fetch
|
||||
if record.length == 0 {
|
||||
return Err("Invalid CDX record: missing length".into());
|
||||
}
|
||||
let warc_url = format!("{}/{}", self.data_base, record.filename);
|
||||
let range = format!("bytes={}-{}", record.offset, record.offset + record.length - 1);
|
||||
|
||||
let resp = self.http.get(&warc_url)
|
||||
.header("Range", &range)
|
||||
.send().await.map_err(|e| format!("WARC fetch failed for {}: {e}", record.url))?;
|
||||
if !resp.status().is_success() && resp.status().as_u16() != 206 {
|
||||
return Err(format!("WARC returned status {}", resp.status()));
|
||||
}
|
||||
let warc_bytes = resp.bytes().await.map_err(|e| format!("WARC body read failed: {e}"))?;
|
||||
|
||||
// Extract text from WARC record
|
||||
self.extract_from_warc(&warc_bytes)?
|
||||
};
|
||||
let content_hash = DataInjector::content_hash(&title, &content);
|
||||
|
||||
// Check for duplicate content
|
||||
|
|
@ -917,6 +1004,12 @@ impl CommonCrawlAdapter {
|
|||
})
|
||||
}
|
||||
|
||||
/// Extract title and text content from raw HTML bytes (Wayback Machine).
|
||||
fn extract_from_html(&self, html_bytes: &[u8]) -> Result<(String, String), String> {
|
||||
let html = String::from_utf8_lossy(html_bytes);
|
||||
self.extract_text_from_html(&html)
|
||||
}
|
||||
|
||||
/// Extract title and text content from WARC record bytes.
|
||||
fn extract_from_warc(&self, warc_bytes: &[u8]) -> Result<(String, String), String> {
|
||||
let warc_str = String::from_utf8_lossy(warc_bytes);
|
||||
|
|
@ -927,6 +1020,11 @@ impl CommonCrawlAdapter {
|
|||
.unwrap_or(0);
|
||||
let html = &warc_str[body_start..];
|
||||
|
||||
self.extract_text_from_html(html)
|
||||
}
|
||||
|
||||
/// Shared text extraction logic for both WARC and raw HTML.
|
||||
fn extract_text_from_html(&self, html: &str) -> Result<(String, String), String> {
|
||||
// Extract title
|
||||
let title = extract_tag(html, "title").unwrap_or_default();
|
||||
|
||||
|
|
|
|||
|
|
@ -28,4 +28,4 @@ getrandom = { version = "0.2", features = ["js"] }
|
|||
default = []
|
||||
|
||||
[package.metadata.wasm-pack.profile.release]
|
||||
wasm-opt = ["-O4"]
|
||||
wasm-opt = false
|
||||
|
|
|
|||
|
|
@ -380,7 +380,7 @@ mod tests {
|
|||
fn test_force_learn() {
|
||||
let engine = SonaEngine::new(256);
|
||||
|
||||
for i in 0..150 {
|
||||
for _i in 0..150 {
|
||||
let mut builder = engine.begin_trajectory(vec![0.1; 256]);
|
||||
builder.add_step(vec![0.5; 256], vec![], 0.8);
|
||||
engine.end_trajectory(builder, 0.8);
|
||||
|
|
@ -390,6 +390,27 @@ mod tests {
|
|||
assert!(result.contains("150 trajectories"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_force_learn_with_few_trajectories() {
|
||||
// Test that forceLearn works even with fewer than min_trajectories (100)
|
||||
let engine = SonaEngine::new(64);
|
||||
|
||||
// Only record 10 trajectories (below the 100 minimum)
|
||||
for _i in 0..10 {
|
||||
let mut builder = engine.begin_trajectory(vec![0.1; 64]);
|
||||
builder.add_step(vec![0.5; 64], vec![], 0.8);
|
||||
engine.end_trajectory(builder, 0.8);
|
||||
}
|
||||
|
||||
let result = engine.force_learn();
|
||||
// Should process 10 trajectories (not "insufficient trajectories")
|
||||
assert!(
|
||||
result.contains("10 trajectories"),
|
||||
"Expected '10 trajectories' but got: {}",
|
||||
result
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_disabled_engine() {
|
||||
let mut engine = SonaEngine::new(64);
|
||||
|
|
|
|||
|
|
@ -105,9 +105,19 @@ impl BackgroundLoop {
|
|||
}
|
||||
|
||||
/// Run background learning cycle
|
||||
pub fn run_cycle(&self, trajectories: Vec<QueryTrajectory>) -> BackgroundResult {
|
||||
if trajectories.len() < self.config.min_trajectories {
|
||||
return BackgroundResult::skipped("insufficient trajectories");
|
||||
///
|
||||
/// If `force` is true, bypasses the minimum trajectory check (for forceLearn API)
|
||||
pub fn run_cycle(&self, trajectories: Vec<QueryTrajectory>, force: bool) -> BackgroundResult {
|
||||
if !force && trajectories.len() < self.config.min_trajectories {
|
||||
return BackgroundResult::skipped(&format!(
|
||||
"insufficient trajectories ({} < {} minimum, use forceLearn to bypass)",
|
||||
trajectories.len(),
|
||||
self.config.min_trajectories
|
||||
));
|
||||
}
|
||||
|
||||
if trajectories.is_empty() {
|
||||
return BackgroundResult::skipped("no trajectories to process");
|
||||
}
|
||||
|
||||
let start = Instant::now();
|
||||
|
|
|
|||
|
|
@ -97,17 +97,17 @@ impl LoopCoordinator {
|
|||
if self.background.should_run() {
|
||||
let trajectories = self.instant.drain_trajectories();
|
||||
if !trajectories.is_empty() {
|
||||
return Some(self.background.run_cycle(trajectories));
|
||||
return Some(self.background.run_cycle(trajectories, false));
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Force background cycle
|
||||
/// Force background cycle (bypasses minimum trajectory check)
|
||||
pub fn force_background(&self) -> BackgroundResult {
|
||||
let trajectories = self.instant.drain_trajectories();
|
||||
self.background.run_cycle(trajectories)
|
||||
self.background.run_cycle(trajectories, true)
|
||||
}
|
||||
|
||||
/// Flush instant loop updates
|
||||
|
|
|
|||
|
|
@ -428,7 +428,7 @@ class Intelligence {
|
|||
const server = new Server(
|
||||
{
|
||||
name: 'ruvector',
|
||||
version: '0.2.12',
|
||||
version: '0.2.13',
|
||||
},
|
||||
{
|
||||
capabilities: {
|
||||
|
|
@ -3054,9 +3054,15 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|||
}
|
||||
|
||||
case 'workers_create': {
|
||||
const name = args.name;
|
||||
const preset = args.preset || 'quick-scan';
|
||||
const triggers = args.triggers;
|
||||
const name = sanitizeShellArg(args.name);
|
||||
const preset = sanitizeShellArg(args.preset || 'quick-scan');
|
||||
const triggers = args.triggers ? sanitizeShellArg(args.triggers) : null;
|
||||
if (!name) {
|
||||
return { content: [{ type: 'text', text: JSON.stringify({
|
||||
success: false,
|
||||
error: 'Invalid worker name'
|
||||
}, null, 2) }] };
|
||||
}
|
||||
try {
|
||||
let cmd = `npx agentic-flow@alpha workers create "${name}" --preset ${preset}`;
|
||||
if (triggers) cmd += ` --triggers "${triggers}"`;
|
||||
|
|
@ -4132,7 +4138,7 @@ async function main() {
|
|||
transport: 'sse',
|
||||
sessions: sessions.size,
|
||||
tools: 91,
|
||||
version: '0.2.12'
|
||||
version: '0.2.13'
|
||||
}));
|
||||
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "ruvector",
|
||||
"version": "0.2.12",
|
||||
"version": "0.2.13",
|
||||
"description": "High-performance vector database for Node.js with automatic native/WASM fallback",
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
|
|
|
|||
|
|
@ -227,8 +227,32 @@ class SonaEngine {
|
|||
* @returns Statistics object
|
||||
*/
|
||||
getStats() {
|
||||
const statsJson = this._native.getStats();
|
||||
return JSON.parse(statsJson);
|
||||
const statsStr = this._native.getStats();
|
||||
// Try JSON first (ideal format)
|
||||
try {
|
||||
return JSON.parse(statsStr);
|
||||
} catch {
|
||||
// Fall back to parsing Rust debug format: "StructName { field: value, ... }"
|
||||
// e.g., "CoordinatorStats { trajectories_buffered: 0, ... }"
|
||||
const match = statsStr.match(/\{([^}]+)\}/);
|
||||
if (match) {
|
||||
const obj = {};
|
||||
const pairs = match[1].split(',').map(s => s.trim());
|
||||
for (const pair of pairs) {
|
||||
const [key, val] = pair.split(':').map(s => s.trim());
|
||||
if (key && val !== undefined) {
|
||||
// Parse value: bool, number, or string
|
||||
if (val === 'true') obj[key] = true;
|
||||
else if (val === 'false') obj[key] = false;
|
||||
else if (!isNaN(parseFloat(val))) obj[key] = parseFloat(val);
|
||||
else obj[key] = val;
|
||||
}
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
// Return raw string if all parsing fails
|
||||
return { raw: statsStr };
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Enable or disable the engine
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue