agent: Cleanup edit_file evals (#55750)

- Rename `streaming_edit_file` to `edit_file`
- Remove workaround for replacing old edit tool with streaming edit file
tool

Release Notes:

- N/A
This commit is contained in:
Bennet Bo Fenner 2026-05-05 11:58:05 +02:00 committed by GitHub
parent f482f9e18c
commit d6cc34c167
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 14 additions and 38 deletions

View file

@ -1,2 +1,2 @@
#[cfg(all(test, feature = "unit-eval"))]
mod streaming_edit_file;
mod edit_file;

View file

@ -15,9 +15,8 @@ use language::language_settings::FormatOnSave;
use language_model::{
LanguageModel, LanguageModelCompletionError, LanguageModelCompletionEvent,
LanguageModelRegistry, LanguageModelRequest, LanguageModelRequestMessage,
LanguageModelRequestTool, LanguageModelToolResult, LanguageModelToolResultContent,
LanguageModelToolSchemaFormat, LanguageModelToolUse, LanguageModelToolUseId, MessageContent,
Role, SelectedModel,
LanguageModelToolResult, LanguageModelToolResultContent, LanguageModelToolUse,
LanguageModelToolUseId, MessageContent, Role, SelectedModel,
};
use project::Project;
use prompt_store::{ProjectContext, WorktreeContext};
@ -218,12 +217,12 @@ impl EvalAssertion {
}
#[derive(Clone)]
struct StreamingEditEvalOutput {
struct EditEvalOutput {
sample: EvalSample,
assertion: EvalAssertionOutcome,
}
impl Display for StreamingEditEvalOutput {
impl Display for EditEvalOutput {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
writeln!(f, "Score: {:?}", self.assertion.score)?;
if let Some(message) = self.assertion.message.as_ref() {
@ -241,7 +240,7 @@ struct EvalAssertionOutcome {
message: Option<String>,
}
struct StreamingEditToolTest {
struct EditToolTest {
fs: Arc<FakeFs>,
project: Entity<Project>,
model: Arc<dyn LanguageModel>,
@ -249,7 +248,7 @@ struct StreamingEditToolTest {
model_thinking_effort: Option<String>,
}
impl StreamingEditToolTest {
impl EditToolTest {
async fn new(cx: &mut TestAppContext) -> Self {
cx.executor().allow_parking();
@ -349,29 +348,7 @@ impl StreamingEditToolTest {
}))
}
/// Build the tool definitions for the model, replacing `edit_file` with the
/// streaming edit file tool schema. In production the streaming tool is
/// exposed under the name `"edit_file"` (see `Thread::enabled_tools`), so
/// the model has never seen the name `"streaming_edit_file"`.
fn build_tools() -> Vec<LanguageModelRequestTool> {
let mut tools: Vec<LanguageModelRequestTool> = crate::built_in_tools()
.filter(|tool| tool.name != EditFileTool::NAME)
.collect();
tools.push(LanguageModelRequestTool {
name: EditFileTool::NAME.to_string(),
description: EditFileTool::description().to_string(),
input_schema: EditFileTool::input_schema(LanguageModelToolSchemaFormat::JsonSchema)
.to_value(),
use_input_streaming: EditFileTool::supports_input_streaming(),
});
tools
}
async fn eval(
&self,
mut eval: EvalInput,
cx: &mut TestAppContext,
) -> Result<StreamingEditEvalOutput> {
async fn eval(&self, mut eval: EvalInput, cx: &mut TestAppContext) -> Result<EditEvalOutput> {
eval.conversation
.last_mut()
.context("Conversation must not be empty")?
@ -391,7 +368,7 @@ impl StreamingEditToolTest {
cx.run_until_parked();
}
let tools = Self::build_tools();
let tools = crate::built_in_tools().collect::<Vec<_>>();
let system_prompt = {
let worktrees = vec![WorktreeContext {
@ -440,7 +417,7 @@ impl StreamingEditToolTest {
};
// The model will call the tool as "edit_file" (the production-visible
// name), but the schema is from StreamingEditFileTool.
// name), but the schema is from EditFileTool.
let tool_input =
retry_on_rate_limit(async || self.extract_tool_use(request.clone(), cx).await).await?;
@ -505,12 +482,11 @@ impl StreamingEditToolTest {
.run(&sample, self.judge_model.clone(), cx)
.await?;
Ok(StreamingEditEvalOutput { assertion, sample })
Ok(EditEvalOutput { assertion, sample })
}
/// Stream the model completion and extract the first complete tool use
/// whose name matches `EditFileTool::NAME` (the production-visible name
/// for the streaming edit tool), parsed as `StreamingEditFileToolInput`.
/// whose name matches `EditFileTool::NAME`, parsed as `EditFileToolInput`.
async fn extract_tool_use(
&self,
request: LanguageModelRequest,
@ -538,7 +514,7 @@ impl StreamingEditToolTest {
&& tool_use.name.as_ref() == EditFileTool::NAME =>
{
let input: EditFileToolInput = serde_json::from_value(tool_use.input)
.context("Failed to parse tool input as StreamingEditFileToolInput")?;
.context("Failed to parse tool input as EditFileToolInput")?;
return Ok(input);
}
Ok(LanguageModelCompletionEvent::Text(text)) => {
@ -590,7 +566,7 @@ fn run_eval(eval: EvalInput) -> eval_utils::EvalOutput<()> {
let mut cx = TestAppContext::build(dispatcher, None);
let foreground_executor = cx.foreground_executor().clone();
let result = foreground_executor.block_test(async {
let test = StreamingEditToolTest::new(&mut cx).await;
let test = EditToolTest::new(&mut cx).await;
let result = test.eval(eval, &mut cx).await;
drop(test);
cx.run_until_parked();