mirror of
https://github.com/block/goose.git
synced 2026-04-28 03:29:36 +00:00
feat: adversarial agent for preventing leaking of info and more (#7948)
Some checks failed
Canary / Prepare Version (push) Failing after 3s
Canary / bundle-desktop (push) Has been skipped
Canary / bundle-desktop-intel (push) Has been skipped
Canary / bundle-desktop-linux (push) Has been skipped
Canary / bundle-desktop-windows (push) Has been skipped
Canary / build-cli (push) Has been skipped
Canary / Upload Install Script (push) Has been skipped
Canary / Release (push) Has been skipped
Unused Dependencies / machete (push) Has been skipped
CI / changes (push) Failing after 3s
Deploy Documentation / deploy (push) Failing after 4s
Live Provider Tests / check-fork (push) Successful in 2s
Publish Ask AI Bot Docker Image / docker (push) Failing after 5s
Publish Docker Image / docker (push) Failing after 3s
Scorecard supply-chain security / Scorecard analysis (push) Has been skipped
CI / Check Rust Code Format (push) Has been skipped
CI / Build and Test Rust Project (push) Has been skipped
CI / Lint Rust Code (push) Has been skipped
CI / Check OpenAPI Schema is Up-to-Date (push) Has been skipped
Live Provider Tests / changes (push) Failing after 3s
Live Provider Tests / Build Binary (push) Has been skipped
Live Provider Tests / Smoke Tests (Code Execution) (push) Has been skipped
Live Provider Tests / Smoke Tests (push) Has been skipped
Live Provider Tests / Compaction Tests (push) Has been skipped
Live Provider Tests / goose server HTTP integration tests (push) Has been skipped
CI / Test and Lint Electron Desktop App (push) Has been cancelled
Some checks failed
Canary / Prepare Version (push) Failing after 3s
Canary / bundle-desktop (push) Has been skipped
Canary / bundle-desktop-intel (push) Has been skipped
Canary / bundle-desktop-linux (push) Has been skipped
Canary / bundle-desktop-windows (push) Has been skipped
Canary / build-cli (push) Has been skipped
Canary / Upload Install Script (push) Has been skipped
Canary / Release (push) Has been skipped
Unused Dependencies / machete (push) Has been skipped
CI / changes (push) Failing after 3s
Deploy Documentation / deploy (push) Failing after 4s
Live Provider Tests / check-fork (push) Successful in 2s
Publish Ask AI Bot Docker Image / docker (push) Failing after 5s
Publish Docker Image / docker (push) Failing after 3s
Scorecard supply-chain security / Scorecard analysis (push) Has been skipped
CI / Check Rust Code Format (push) Has been skipped
CI / Build and Test Rust Project (push) Has been skipped
CI / Lint Rust Code (push) Has been skipped
CI / Check OpenAPI Schema is Up-to-Date (push) Has been skipped
Live Provider Tests / changes (push) Failing after 3s
Live Provider Tests / Build Binary (push) Has been skipped
Live Provider Tests / Smoke Tests (Code Execution) (push) Has been skipped
Live Provider Tests / Smoke Tests (push) Has been skipped
Live Provider Tests / Compaction Tests (push) Has been skipped
Live Provider Tests / goose server HTTP integration tests (push) Has been skipped
CI / Test and Lint Electron Desktop App (push) Has been cancelled
This commit is contained in:
parent
a0835be10f
commit
754c214df4
6 changed files with 904 additions and 0 deletions
|
|
@ -44,6 +44,7 @@ use crate::providers::base::{PermissionRouting, Provider};
|
|||
use crate::providers::errors::ProviderError;
|
||||
use crate::recipe::{Author, Recipe, Response, Settings};
|
||||
use crate::scheduler_trait::SchedulerTrait;
|
||||
use crate::security::adversary_inspector::AdversaryInspector;
|
||||
use crate::security::security_inspector::SecurityInspector;
|
||||
use crate::session::extension_data::{EnabledExtensionsState, ExtensionState};
|
||||
use crate::session::{Session, SessionManager};
|
||||
|
|
@ -263,6 +264,9 @@ impl Agent {
|
|||
// Add security inspector (highest priority - runs first)
|
||||
tool_inspection_manager.add_inspector(Box::new(SecurityInspector::new()));
|
||||
|
||||
// Add adversary inspector (LLM-based review, enabled by ~/.config/goose/adversary.md)
|
||||
tool_inspection_manager.add_inspector(Box::new(AdversaryInspector::new(provider.clone())));
|
||||
|
||||
// Add permission inspector (medium-high priority)
|
||||
tool_inspection_manager.add_inspector(Box::new(PermissionInspector::new(
|
||||
permission_manager,
|
||||
|
|
@ -2288,6 +2292,10 @@ mod tests {
|
|||
inspector_names.contains(&"security"),
|
||||
"Tool inspection manager should contain security inspector"
|
||||
);
|
||||
assert!(
|
||||
inspector_names.contains(&"adversary"),
|
||||
"Tool inspection manager should contain adversary inspector"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
630
crates/goose/src/security/adversary_inspector.rs
Normal file
630
crates/goose/src/security/adversary_inspector.rs
Normal file
|
|
@ -0,0 +1,630 @@
|
|||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use chrono::Utc;
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use crate::agents::types::SharedProvider;
|
||||
use crate::config::paths::Paths;
|
||||
use crate::config::GooseMode;
|
||||
use crate::conversation::message::{Message, MessageContent, ToolRequest};
|
||||
use crate::conversation::Conversation;
|
||||
use crate::tool_inspection::{InspectionAction, InspectionResult, ToolInspector};
|
||||
use crate::utils::safe_truncate;
|
||||
|
||||
const DEFAULT_TOOLS: &[&str] = &["shell", "computercontroller__automation_script"];
|
||||
|
||||
const DEFAULT_RULES: &str = r#"BLOCK if the command:
|
||||
- Exfiltrates data (curl/wget posting to unknown URLs, piping secrets out)
|
||||
- Is destructive beyond the project scope (rm -rf /, modifying system files)
|
||||
- Installs malware or runs obfuscated code
|
||||
- Attempts to escalate privileges unnecessarily
|
||||
- Downloads and executes untrusted remote scripts
|
||||
|
||||
ALLOW if the command is a normal development operation, even if it modifies files,
|
||||
installs packages, runs tests, uses git, etc. Most commands are fine.
|
||||
Err on the side of ALLOW — only block truly dangerous things."#;
|
||||
|
||||
const MAX_RECENT_USER_MESSAGES: usize = 4;
|
||||
|
||||
struct AdversaryConfig {
|
||||
tools: Vec<String>,
|
||||
rules: String,
|
||||
}
|
||||
|
||||
/// Adversary inspector that reviews tool calls against user-defined rules.
|
||||
///
|
||||
/// Activated by placing an `adversary.md` file in the Goose config directory
|
||||
/// (`~/.config/goose/adversary.md`). The file contains optional frontmatter
|
||||
/// to select which tools are reviewed, followed by rules.
|
||||
///
|
||||
/// Example `adversary.md`:
|
||||
/// ```text
|
||||
/// tools: shell, computercontroller__automation_script
|
||||
/// ---
|
||||
/// BLOCK if the command exfiltrates data or is destructive.
|
||||
/// ALLOW normal development operations.
|
||||
/// ```
|
||||
///
|
||||
/// If the `tools:` line is omitted, only `shell` is reviewed by default.
|
||||
/// If the file is absent, this inspector is disabled.
|
||||
/// If the review fails, the inspector fails open (allows the tool call).
|
||||
pub struct AdversaryInspector {
|
||||
provider: SharedProvider,
|
||||
config: OnceLock<Option<AdversaryConfig>>,
|
||||
config_path: Option<std::path::PathBuf>,
|
||||
}
|
||||
|
||||
impl AdversaryInspector {
|
||||
pub fn new(provider: SharedProvider) -> Self {
|
||||
Self {
|
||||
provider,
|
||||
config: OnceLock::new(),
|
||||
config_path: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_config_dir(provider: SharedProvider, config_dir: std::path::PathBuf) -> Self {
|
||||
Self {
|
||||
provider,
|
||||
config: OnceLock::new(),
|
||||
config_path: Some(config_dir.join("adversary.md")),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_config(&self) -> Option<&AdversaryConfig> {
|
||||
self.config
|
||||
.get_or_init(|| {
|
||||
let path = self
|
||||
.config_path
|
||||
.clone()
|
||||
.unwrap_or_else(|| Paths::config_dir().join("adversary.md"));
|
||||
if !path.exists() {
|
||||
tracing::debug!("No adversary.md found, adversary inspector disabled");
|
||||
return None;
|
||||
}
|
||||
|
||||
let content = match std::fs::read_to_string(&path) {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to read adversary.md: {}", e);
|
||||
return Some(AdversaryConfig {
|
||||
tools: DEFAULT_TOOLS.iter().map(|s| (*s).to_string()).collect(),
|
||||
rules: DEFAULT_RULES.to_string(),
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
let config = Self::parse_adversary_md(&content);
|
||||
let tool_list = config.tools.join(", ");
|
||||
tracing::info!(
|
||||
tools = %tool_list,
|
||||
"Adversary inspector enabled from {}",
|
||||
path.display()
|
||||
);
|
||||
Some(config)
|
||||
})
|
||||
.as_ref()
|
||||
}
|
||||
|
||||
/// Parse adversary.md content, extracting optional `tools:` frontmatter.
|
||||
///
|
||||
/// Format:
|
||||
/// ```text
|
||||
/// tools: shell, computercontroller__automation_script
|
||||
/// ---
|
||||
/// BLOCK if ...
|
||||
/// ```
|
||||
///
|
||||
/// If no `tools:` line or `---` separator, the entire content is rules
|
||||
/// and tools defaults to `["shell"]`.
|
||||
fn parse_adversary_md(content: &str) -> AdversaryConfig {
|
||||
let trimmed = content.trim();
|
||||
if trimmed.is_empty() {
|
||||
return AdversaryConfig {
|
||||
tools: DEFAULT_TOOLS.iter().map(|s| (*s).to_string()).collect(),
|
||||
rules: DEFAULT_RULES.to_string(),
|
||||
};
|
||||
}
|
||||
|
||||
// Look for frontmatter: lines before a `---` separator
|
||||
if let Some((frontmatter, rest)) = trimmed.split_once("\n---") {
|
||||
let rules = rest.trim();
|
||||
|
||||
let mut tools: Option<Vec<String>> = None;
|
||||
for line in frontmatter.lines() {
|
||||
let line = line.trim();
|
||||
if let Some(value) = line.strip_prefix("tools:") {
|
||||
tools = Some(
|
||||
value
|
||||
.split(',')
|
||||
.map(|t| t.trim().to_string())
|
||||
.filter(|t| !t.is_empty())
|
||||
.collect(),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let rules = if rules.is_empty() {
|
||||
DEFAULT_RULES.to_string()
|
||||
} else {
|
||||
rules.to_string()
|
||||
};
|
||||
|
||||
AdversaryConfig {
|
||||
tools: tools
|
||||
.unwrap_or_else(|| DEFAULT_TOOLS.iter().map(|s| (*s).to_string()).collect()),
|
||||
rules,
|
||||
}
|
||||
} else {
|
||||
// No frontmatter — entire content is rules
|
||||
AdversaryConfig {
|
||||
tools: DEFAULT_TOOLS.iter().map(|s| (*s).to_string()).collect(),
|
||||
rules: trimmed.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn should_review(config: &AdversaryConfig, tool_request: &ToolRequest) -> bool {
|
||||
let tool_name = match &tool_request.tool_call {
|
||||
Ok(tc) => tc.name.as_ref(),
|
||||
Err(_) => return false,
|
||||
};
|
||||
config.tools.iter().any(|t| t == tool_name)
|
||||
}
|
||||
|
||||
fn format_tool_call(tool_request: &ToolRequest) -> String {
|
||||
match &tool_request.tool_call {
|
||||
Ok(tc) => {
|
||||
let mut s = format!("Tool: {}", tc.name);
|
||||
if let Some(args) = &tc.arguments {
|
||||
if let Some(cmd) = args.get("command").and_then(|v| v.as_str()) {
|
||||
s = format!("Tool: {} — command: {}", tc.name, cmd);
|
||||
} else if let Ok(json) = serde_json::to_string_pretty(args) {
|
||||
s.push_str("\nArguments: ");
|
||||
s.push_str(&json);
|
||||
}
|
||||
}
|
||||
s
|
||||
}
|
||||
Err(e) => format!("(malformed tool call: {})", e),
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_recent_user_messages(messages: &[Message], count: usize) -> Vec<String> {
|
||||
messages
|
||||
.iter()
|
||||
.rev()
|
||||
.filter(|m| m.role == rmcp::model::Role::User)
|
||||
.filter_map(|m| {
|
||||
let text: String = m
|
||||
.content
|
||||
.iter()
|
||||
.filter_map(|c| match c {
|
||||
MessageContent::Text(t) => Some(t.text.clone()),
|
||||
_ => None,
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
if text.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(text)
|
||||
}
|
||||
})
|
||||
.take(count)
|
||||
.collect::<Vec<_>>()
|
||||
.into_iter()
|
||||
.rev()
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn extract_original_task(messages: &[Message]) -> String {
|
||||
for msg in messages {
|
||||
if msg.role == rmcp::model::Role::User {
|
||||
let text: String = msg
|
||||
.content
|
||||
.iter()
|
||||
.filter_map(|c| match c {
|
||||
MessageContent::Text(t) => Some(t.text.clone()),
|
||||
_ => None,
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
if !text.is_empty() {
|
||||
return safe_truncate(&text, 500);
|
||||
}
|
||||
}
|
||||
}
|
||||
"(unknown)".to_string()
|
||||
}
|
||||
|
||||
async fn consult_llm(
|
||||
&self,
|
||||
tool_description: &str,
|
||||
original_task: &str,
|
||||
recent_messages: &[String],
|
||||
rules: &str,
|
||||
) -> Result<(bool, String)> {
|
||||
let provider_guard = self.provider.lock().await;
|
||||
let provider = match provider_guard.clone() {
|
||||
Some(p) => p,
|
||||
None => return Ok((true, "No provider available".to_string())),
|
||||
};
|
||||
drop(provider_guard);
|
||||
|
||||
let history_section = if !recent_messages.is_empty() {
|
||||
let mut s = String::from("Recent user messages (oldest first):\n");
|
||||
for (i, msg) in recent_messages.iter().enumerate() {
|
||||
let truncated = safe_truncate(msg, 200);
|
||||
s.push_str(&format!("{}. {}\n", i + 1, truncated));
|
||||
}
|
||||
s.push('\n');
|
||||
s
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
let user_message = format!(
|
||||
"The user's original task: {}\n\n\
|
||||
{}\
|
||||
The tool call to review:\n\
|
||||
```\n\
|
||||
{}\n\
|
||||
```\n\n\
|
||||
{}\n\n\
|
||||
Respond with a single word on the first line: ALLOW or BLOCK\n\
|
||||
Then on the next line, a brief reason.",
|
||||
original_task, history_section, tool_description, rules
|
||||
);
|
||||
|
||||
let system_prompt = "You are an adversarial security reviewer, protecting the user \
|
||||
in case the other agent is rogue. An AI coding agent is about to execute a tool \
|
||||
call. Your ONLY job: decide if this tool call is safe given the user's task and \
|
||||
rules. Respond with ALLOW or BLOCK on the first line, then a brief reason on \
|
||||
the next line.";
|
||||
|
||||
let check_messages = vec![Message::new(
|
||||
rmcp::model::Role::User,
|
||||
Utc::now().timestamp(),
|
||||
vec![MessageContent::text(user_message)],
|
||||
)];
|
||||
let conversation = Conversation::new_unvalidated(check_messages);
|
||||
|
||||
let model_config = provider.get_model_config();
|
||||
let (response, _usage) = provider
|
||||
.complete(
|
||||
&model_config,
|
||||
"",
|
||||
system_prompt,
|
||||
conversation.messages(),
|
||||
&[],
|
||||
)
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("Adversary LLM call failed: {}", e))?;
|
||||
|
||||
let output: String = response
|
||||
.content
|
||||
.iter()
|
||||
.filter_map(|c| match c {
|
||||
MessageContent::Text(t) => Some(t.text.clone()),
|
||||
_ => None,
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
|
||||
let output = output.trim();
|
||||
let upper = output.to_uppercase();
|
||||
|
||||
if upper.starts_with("BLOCK") || upper.contains("\nBLOCK") {
|
||||
let reason = output
|
||||
.lines()
|
||||
.skip(1)
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
.trim()
|
||||
.to_string();
|
||||
let reason = if reason.is_empty() {
|
||||
"Blocked by adversary".to_string()
|
||||
} else {
|
||||
reason
|
||||
};
|
||||
Ok((false, reason))
|
||||
} else {
|
||||
let reason = output
|
||||
.lines()
|
||||
.skip(1)
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
.trim()
|
||||
.to_string();
|
||||
Ok((true, reason))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ToolInspector for AdversaryInspector {
|
||||
fn name(&self) -> &'static str {
|
||||
"adversary"
|
||||
}
|
||||
|
||||
fn as_any(&self) -> &dyn std::any::Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn is_enabled(&self) -> bool {
|
||||
self.get_config().is_some()
|
||||
}
|
||||
|
||||
async fn inspect(
|
||||
&self,
|
||||
_session_id: &str,
|
||||
tool_requests: &[ToolRequest],
|
||||
messages: &[Message],
|
||||
_goose_mode: GooseMode,
|
||||
) -> Result<Vec<InspectionResult>> {
|
||||
let config = match self.get_config() {
|
||||
Some(c) => c,
|
||||
None => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let original_task = Self::extract_original_task(messages);
|
||||
let recent_messages =
|
||||
Self::extract_recent_user_messages(messages, MAX_RECENT_USER_MESSAGES);
|
||||
|
||||
let mut results = Vec::new();
|
||||
|
||||
for request in tool_requests {
|
||||
if !Self::should_review(config, request) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let tool_description = Self::format_tool_call(request);
|
||||
|
||||
tracing::debug!(
|
||||
tool_request_id = %request.id,
|
||||
"Adversary inspector reviewing tool call"
|
||||
);
|
||||
|
||||
match self
|
||||
.consult_llm(
|
||||
&tool_description,
|
||||
&original_task,
|
||||
&recent_messages,
|
||||
&config.rules,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok((true, reason)) => {
|
||||
tracing::debug!(
|
||||
tool_request_id = %request.id,
|
||||
reason = %reason,
|
||||
"Adversary: ALLOW"
|
||||
);
|
||||
results.push(InspectionResult {
|
||||
tool_request_id: request.id.clone(),
|
||||
action: InspectionAction::Allow,
|
||||
reason: format!("Adversary: {}", reason),
|
||||
confidence: 1.0,
|
||||
inspector_name: self.name().to_string(),
|
||||
finding_id: None,
|
||||
});
|
||||
}
|
||||
Ok((false, reason)) => {
|
||||
tracing::warn!(
|
||||
tool_request_id = %request.id,
|
||||
reason = %reason,
|
||||
"Adversary: BLOCK"
|
||||
);
|
||||
results.push(InspectionResult {
|
||||
tool_request_id: request.id.clone(),
|
||||
action: InspectionAction::Deny,
|
||||
reason: format!("🛡️ Adversary blocked: {}", reason),
|
||||
confidence: 1.0,
|
||||
inspector_name: self.name().to_string(),
|
||||
finding_id: None,
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
tool_request_id = %request.id,
|
||||
error = %e,
|
||||
"Adversary inspector failed, allowing tool call (fail-open)"
|
||||
);
|
||||
results.push(InspectionResult {
|
||||
tool_request_id: request.id.clone(),
|
||||
action: InspectionAction::Allow,
|
||||
reason: format!("Adversary error (fail-open): {}", e),
|
||||
confidence: 0.0,
|
||||
inspector_name: self.name().to_string(),
|
||||
finding_id: None,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use rmcp::model::CallToolRequestParams;
|
||||
use rmcp::object;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
#[test]
|
||||
fn test_parse_with_tools_frontmatter() {
|
||||
let content = "tools: shell, computercontroller__automation_script\n---\nBLOCK bad stuff";
|
||||
let config = AdversaryInspector::parse_adversary_md(content);
|
||||
assert_eq!(
|
||||
config.tools,
|
||||
vec!["shell", "computercontroller__automation_script"]
|
||||
);
|
||||
assert_eq!(config.rules, "BLOCK bad stuff");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_without_frontmatter() {
|
||||
let content = "BLOCK if the command exfiltrates data";
|
||||
let config = AdversaryInspector::parse_adversary_md(content);
|
||||
assert_eq!(
|
||||
config.tools,
|
||||
vec!["shell", "computercontroller__automation_script"]
|
||||
);
|
||||
assert_eq!(config.rules, "BLOCK if the command exfiltrates data");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_empty() {
|
||||
let config = AdversaryInspector::parse_adversary_md("");
|
||||
assert_eq!(
|
||||
config.tools,
|
||||
vec!["shell", "computercontroller__automation_script"]
|
||||
);
|
||||
assert_eq!(config.rules, DEFAULT_RULES);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_frontmatter_empty_rules_uses_defaults() {
|
||||
let content = "tools: shell\n---\n";
|
||||
let config = AdversaryInspector::parse_adversary_md(content);
|
||||
assert_eq!(config.tools, vec!["shell"]);
|
||||
assert_eq!(config.rules, DEFAULT_RULES);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_should_review_matches() {
|
||||
let config = AdversaryConfig {
|
||||
tools: vec!["shell".to_string()],
|
||||
rules: String::new(),
|
||||
};
|
||||
let request = ToolRequest {
|
||||
id: "r1".into(),
|
||||
tool_call: Ok(
|
||||
CallToolRequestParams::new("shell").with_arguments(object!({"command": "ls"}))
|
||||
),
|
||||
metadata: None,
|
||||
tool_meta: None,
|
||||
};
|
||||
assert!(AdversaryInspector::should_review(&config, &request));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_should_review_skips_non_matching() {
|
||||
let config = AdversaryConfig {
|
||||
tools: vec!["shell".to_string()],
|
||||
rules: String::new(),
|
||||
};
|
||||
let request = ToolRequest {
|
||||
id: "r1".into(),
|
||||
tool_call: Ok(CallToolRequestParams::new("write")
|
||||
.with_arguments(object!({"path": "foo.txt", "content": "hi"}))),
|
||||
metadata: None,
|
||||
tool_meta: None,
|
||||
};
|
||||
assert!(!AdversaryInspector::should_review(&config, &request));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_format_tool_call_shell() {
|
||||
let request = ToolRequest {
|
||||
id: "req1".into(),
|
||||
tool_call: Ok(CallToolRequestParams::new("shell")
|
||||
.with_arguments(object!({"command": "rm -rf /"}))),
|
||||
metadata: None,
|
||||
tool_meta: None,
|
||||
};
|
||||
let formatted = AdversaryInspector::format_tool_call(&request);
|
||||
assert!(formatted.contains("shell"));
|
||||
assert!(formatted.contains("rm -rf /"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_format_tool_call_write() {
|
||||
let request = ToolRequest {
|
||||
id: "req2".into(),
|
||||
tool_call: Ok(CallToolRequestParams::new("write")
|
||||
.with_arguments(object!({"path": "/etc/passwd", "content": "hacked"}))),
|
||||
metadata: None,
|
||||
tool_meta: None,
|
||||
};
|
||||
let formatted = AdversaryInspector::format_tool_call(&request);
|
||||
assert!(formatted.contains("write"));
|
||||
assert!(formatted.contains("/etc/passwd"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_original_task() {
|
||||
let messages = vec![
|
||||
Message::new(
|
||||
rmcp::model::Role::User,
|
||||
Utc::now().timestamp(),
|
||||
vec![MessageContent::text("Refactor the auth module")],
|
||||
),
|
||||
Message::new(
|
||||
rmcp::model::Role::Assistant,
|
||||
Utc::now().timestamp(),
|
||||
vec![MessageContent::text("Sure, I'll start by...")],
|
||||
),
|
||||
];
|
||||
let task = AdversaryInspector::extract_original_task(&messages);
|
||||
assert_eq!(task, "Refactor the auth module");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_recent_user_messages() {
|
||||
let messages = vec![
|
||||
Message::new(
|
||||
rmcp::model::Role::User,
|
||||
Utc::now().timestamp(),
|
||||
vec![MessageContent::text("First message")],
|
||||
),
|
||||
Message::new(
|
||||
rmcp::model::Role::Assistant,
|
||||
Utc::now().timestamp(),
|
||||
vec![MessageContent::text("Response")],
|
||||
),
|
||||
Message::new(
|
||||
rmcp::model::Role::User,
|
||||
Utc::now().timestamp(),
|
||||
vec![MessageContent::text("Second message")],
|
||||
),
|
||||
Message::new(
|
||||
rmcp::model::Role::User,
|
||||
Utc::now().timestamp(),
|
||||
vec![MessageContent::text("Third message")],
|
||||
),
|
||||
];
|
||||
let recent = AdversaryInspector::extract_recent_user_messages(&messages, 2);
|
||||
assert_eq!(recent.len(), 2);
|
||||
assert_eq!(recent[0], "Second message");
|
||||
assert_eq!(recent[1], "Third message");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_disabled_when_no_adversary_md() {
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
|
||||
let provider: SharedProvider = Arc::new(Mutex::new(None));
|
||||
let inspector = AdversaryInspector::with_config_dir(provider, tmp.path().to_path_buf());
|
||||
assert!(!inspector.is_enabled());
|
||||
|
||||
let request = ToolRequest {
|
||||
id: "req1".into(),
|
||||
tool_call: Ok(
|
||||
CallToolRequestParams::new("shell").with_arguments(object!({"command": "ls"}))
|
||||
),
|
||||
metadata: None,
|
||||
tool_meta: None,
|
||||
};
|
||||
|
||||
let results = inspector
|
||||
.inspect("test", &[request], &[], GooseMode::Auto)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(results.is_empty());
|
||||
}
|
||||
}
|
||||
|
|
@ -1,3 +1,4 @@
|
|||
pub mod adversary_inspector;
|
||||
pub mod classification_client;
|
||||
pub mod patterns;
|
||||
pub mod scanner;
|
||||
|
|
|
|||
172
crates/goose/tests/adversary_inspector_tests.rs
Normal file
172
crates/goose/tests/adversary_inspector_tests.rs
Normal file
|
|
@ -0,0 +1,172 @@
|
|||
use goose::config::GooseMode;
|
||||
use goose::conversation::message::{Message, MessageContent, ToolRequest};
|
||||
use goose::security::adversary_inspector::AdversaryInspector;
|
||||
use goose::tool_inspection::ToolInspector;
|
||||
use rmcp::model::CallToolRequestParams;
|
||||
use rmcp::object;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
fn make_request(
|
||||
id: &str,
|
||||
tool: &str,
|
||||
args: serde_json::Map<String, serde_json::Value>,
|
||||
) -> ToolRequest {
|
||||
ToolRequest {
|
||||
id: id.into(),
|
||||
tool_call: Ok(CallToolRequestParams::new(tool.to_string()).with_arguments(args)),
|
||||
metadata: None,
|
||||
tool_meta: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn write_adversary_md(dir: &std::path::Path, content: &str) {
|
||||
std::fs::create_dir_all(dir).unwrap();
|
||||
std::fs::write(dir.join("adversary.md"), content).unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_adversary_disabled_without_config_file() {
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
|
||||
let provider = Arc::new(Mutex::new(None));
|
||||
let inspector = AdversaryInspector::with_config_dir(provider, tmp.path().to_path_buf());
|
||||
|
||||
assert_eq!(inspector.name(), "adversary");
|
||||
assert!(!inspector.is_enabled());
|
||||
|
||||
let results = inspector
|
||||
.inspect(
|
||||
"test-session",
|
||||
&[make_request(
|
||||
"r1",
|
||||
"shell",
|
||||
object!({"command": "rm -rf /"}),
|
||||
)],
|
||||
&[],
|
||||
GooseMode::SmartApprove,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert!(results.is_empty());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_adversary_enabled_default_tools() {
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
write_adversary_md(tmp.path(), "BLOCK everything for testing");
|
||||
|
||||
let provider = Arc::new(Mutex::new(None));
|
||||
let inspector = AdversaryInspector::with_config_dir(provider, tmp.path().to_path_buf());
|
||||
|
||||
assert!(inspector.is_enabled());
|
||||
|
||||
let messages = vec![Message::new(
|
||||
rmcp::model::Role::User,
|
||||
chrono::Utc::now().timestamp(),
|
||||
vec![MessageContent::text("build the project")],
|
||||
)];
|
||||
|
||||
// shell is reviewed by default — no provider means fail-open (Allow)
|
||||
let results = inspector
|
||||
.inspect(
|
||||
"test-session",
|
||||
&[make_request(
|
||||
"r1",
|
||||
"shell",
|
||||
object!({"command": "cargo build"}),
|
||||
)],
|
||||
&messages,
|
||||
GooseMode::SmartApprove,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(results.len(), 1);
|
||||
assert!(matches!(
|
||||
results[0].action,
|
||||
goose::tool_inspection::InspectionAction::Allow
|
||||
));
|
||||
|
||||
// write is NOT reviewed by default — skipped entirely
|
||||
let results = inspector
|
||||
.inspect(
|
||||
"test-session",
|
||||
&[make_request(
|
||||
"r1",
|
||||
"write",
|
||||
object!({"path": "foo.txt", "content": "hi"}),
|
||||
)],
|
||||
&messages,
|
||||
GooseMode::SmartApprove,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert!(results.is_empty());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_adversary_custom_tool_filter() {
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
write_adversary_md(
|
||||
tmp.path(),
|
||||
"tools: shell, computercontroller__automation_script\n---\nBLOCK bad stuff",
|
||||
);
|
||||
|
||||
let provider = Arc::new(Mutex::new(None));
|
||||
let inspector = AdversaryInspector::with_config_dir(provider, tmp.path().to_path_buf());
|
||||
|
||||
assert!(inspector.is_enabled());
|
||||
|
||||
let messages = vec![Message::new(
|
||||
rmcp::model::Role::User,
|
||||
chrono::Utc::now().timestamp(),
|
||||
vec![MessageContent::text("do something")],
|
||||
)];
|
||||
|
||||
// shell — reviewed
|
||||
let results = inspector
|
||||
.inspect(
|
||||
"test",
|
||||
&[make_request("r1", "shell", object!({"command": "ls"}))],
|
||||
&messages,
|
||||
GooseMode::Auto,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(results.len(), 1);
|
||||
|
||||
// automation_script — reviewed
|
||||
let results = inspector
|
||||
.inspect(
|
||||
"test",
|
||||
&[make_request(
|
||||
"r2",
|
||||
"computercontroller__automation_script",
|
||||
object!({"script": "echo hi", "language": "shell"}),
|
||||
)],
|
||||
&messages,
|
||||
GooseMode::Auto,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(results.len(), 1);
|
||||
|
||||
// write — NOT reviewed
|
||||
let results = inspector
|
||||
.inspect(
|
||||
"test",
|
||||
&[make_request(
|
||||
"r3",
|
||||
"write",
|
||||
object!({"path": "x.txt", "content": "y"}),
|
||||
)],
|
||||
&messages,
|
||||
GooseMode::Auto,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(results.is_empty());
|
||||
}
|
||||
88
documentation/docs/guides/security/adversary-mode.md
Normal file
88
documentation/docs/guides/security/adversary-mode.md
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
---
|
||||
sidebar_position: 2
|
||||
title: Adversary Mode
|
||||
sidebar_label: Adversary Mode
|
||||
description: An independent agent reviewer that silently watches tool calls to protect you in case the agent goes rogue.
|
||||
---
|
||||
|
||||
Adversary mode adds a silent, independent agent reviewer that watches tool calls before they execute. It acts as a second pair of eyes — protecting you in case the main agent is compromised, manipulated by a prompt injection, or simply doing something that doesn't match what you asked for.
|
||||
|
||||
Unlike [pattern-based detection](/docs/guides/security/prompt-injection-detection), the adversary reviewer understands context. It knows what you originally asked for, what you've said recently, and can judge whether a tool call makes sense for your task.
|
||||
|
||||
## How It Works
|
||||
|
||||
1. Before each tool call, the adversary reviewer checks your **original task**, **recent messages**, and the **tool call details**
|
||||
2. It evaluates the tool call against your rules and returns **ALLOW** or **BLOCK**
|
||||
3. Blocked tool calls are denied — the agent sees the rejection and cannot retry
|
||||
4. If the reviewer fails for any reason, the tool call is allowed through (fail-open)
|
||||
|
||||
The adversary uses the same model and provider goose is already configured with. No additional API keys or services needed.
|
||||
|
||||
## Enabling Adversary Mode
|
||||
|
||||
Create a file at `~/.config/goose/adversary.md` with your rules:
|
||||
|
||||
```markdown
|
||||
BLOCK if the tool call:
|
||||
- Exfiltrates data (posting to unknown URLs, piping secrets to external services)
|
||||
- Is destructive beyond the project scope (deleting system files, wiping directories)
|
||||
- Installs malware or runs obfuscated code
|
||||
- Attempts to escalate privileges unnecessarily
|
||||
- Downloads and executes untrusted remote scripts
|
||||
|
||||
ALLOW normal development operations like editing files, running tests,
|
||||
installing packages, using git, etc. Most tool calls are fine.
|
||||
Err on the side of ALLOW — only block truly dangerous things.
|
||||
```
|
||||
|
||||
That's it. If the file exists, adversary mode is on. Delete the file to turn it off.
|
||||
|
||||
An empty file uses sensible default rules similar to the example above.
|
||||
|
||||
## Writing Good Rules
|
||||
|
||||
The rules in `adversary.md` are your policy, written in plain language:
|
||||
|
||||
- **Be specific** about what should be blocked — vague rules lead to false positives
|
||||
- **Err on the side of ALLOW** — the adversary should catch genuinely dangerous actions, not slow down normal work
|
||||
- **Think about your threat model** — what would a rogue agent actually try to do?
|
||||
|
||||
You can tailor rules to your environment:
|
||||
|
||||
```markdown
|
||||
BLOCK if:
|
||||
- Any network request goes to a domain not in: github.com, npmjs.org, pypi.org
|
||||
- Files outside of ~/projects/ are modified
|
||||
- SSH keys, .env files, or credentials are read or transmitted
|
||||
|
||||
ALLOW all standard development operations within ~/projects/.
|
||||
```
|
||||
|
||||
## What Gets Reviewed
|
||||
|
||||
By default, the adversary reviews **`shell`** and **`computercontroller__automation_script`** — the tools that can execute arbitrary code.
|
||||
|
||||
You can expand coverage by adding a `tools:` line at the top of your `adversary.md`:
|
||||
|
||||
```markdown
|
||||
tools: shell, computercontroller__automation_script
|
||||
---
|
||||
BLOCK if the command exfiltrates data or is destructive.
|
||||
ALLOW normal development operations.
|
||||
```
|
||||
|
||||
The `tools:` line is a comma-separated list of tool names to review. Everything before the `---` separator is configuration; everything after is your rules. If you omit the `tools:` line, `shell` and `computercontroller__automation_script` are reviewed by default.
|
||||
|
||||
Some tool names you might want to add:
|
||||
|
||||
| Tool name | What it does |
|
||||
|-----------|-------------|
|
||||
| `shell` | Executes shell commands (default) |
|
||||
| `computercontroller__automation_script` | Runs shell, Ruby, AppleScript, or PowerShell scripts (default) |
|
||||
| `computercontroller__computer_control` | UI automation — clicks, keystrokes, typing |
|
||||
| `computercontroller__web_scrape` | Fetches arbitrary URLs |
|
||||
|
||||
## See Also
|
||||
|
||||
- [Prompt Injection Detection](/docs/guides/security/prompt-injection-detection) — pattern-based detection (complementary, always-on when enabled)
|
||||
- [goose Permission Modes](/docs/guides/goose-permissions) — control goose's autonomy level
|
||||
|
|
@ -15,6 +15,11 @@ import styles from '@site/src/components/Card/styles.module.css';
|
|||
<div className={styles.categorySection}>
|
||||
<h2 className={styles.categoryTitle}>📚 Documentation & Guides</h2>
|
||||
<div className={styles.cardGrid}>
|
||||
<Card
|
||||
title="Adversary Mode"
|
||||
description="An independent agent reviewer that silently watches tool calls to protect you in case the agent goes rogue."
|
||||
link="/docs/guides/security/adversary-mode"
|
||||
/>
|
||||
<Card
|
||||
title="Prompt Injection Detection"
|
||||
description="Detect and prevent potentially harmful commands before they run."
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue