ep: Move pure diffing functions to zeta_prompt::udiff (#52959)

This PR mostly moves some code around. It also adds a high-level
`format_expected_output` function that routes patch formatting to
specific prompt formats. This way, `zeta_prompt` can format
`expected_output` for training.

Keeping everything prompt-related in a "pure" module (with no heavy
dependencies) makes it easier to write bindings.


Release Notes:

- N/A
This commit is contained in:
Oleksiy Syvokon 2026-04-02 17:00:15 +03:00 committed by GitHub
parent f8d646794d
commit 66ea4b89af
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 1327 additions and 1146 deletions

File diff suppressed because it is too large Load diff

View file

@ -6,11 +6,11 @@ use crate::{
retrieve_context::run_context_retrieval,
};
use anyhow::{Context as _, Result, anyhow};
use edit_prediction::udiff;
use gpui::AsyncApp;
use similar::DiffableStr;
use std::ops::Range;
use std::sync::Arc;
use zeta_prompt::udiff;
use zeta_prompt::{
ZetaFormat, encode_patch_as_output_for_format, excerpt_range_for_format, format_zeta_prompt,
multi_region, output_end_marker_for_format, resolve_cursor_region,

View file

@ -2,8 +2,8 @@ use std::ops::Range;
use std::path::Path;
use std::sync::Arc;
use edit_prediction::udiff::apply_diff_to_string;
use language::{char_diff, text_diff};
use zeta_prompt::udiff::apply_diff_to_string;
use zeta_prompt::ZetaPromptInput;
@ -653,9 +653,9 @@ pub fn compute_prediction_reversal_ratio(
#[cfg(test)]
mod tests {
use super::*;
use edit_prediction::udiff::apply_diff_to_string;
use indoc::indoc;
use zeta_prompt::ExcerptRanges;
use zeta_prompt::udiff::apply_diff_to_string;
fn make_test_prompt_inputs(
content: &str,

View file

@ -10,13 +10,13 @@ use crate::{
reversal_tracking,
};
use anyhow::Context as _;
use edit_prediction::udiff::{apply_diff_to_string, apply_diff_to_string_with_hunk_offset};
use gpui::AsyncApp;
use serde::Serialize;
use std::fs::File;
use std::io::BufWriter;
use std::path::Path;
use std::sync::Arc;
use zeta_prompt::udiff::{apply_diff_to_string, apply_diff_to_string_with_hunk_offset};
pub async fn run_scoring(
example: &mut Example,

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,6 @@
pub mod excerpt_ranges;
pub mod multi_region;
pub mod udiff;
use anyhow::{Result, anyhow};
use serde::{Deserialize, Serialize};
@ -819,6 +820,113 @@ pub fn encode_patch_as_output_for_format(
}
}
/// Given a `ZetaPromptInput`, a format, and a patch (with cursor already
/// extracted), produce the expected model output string for training.
///
/// The editable region is resolved from `input` via `resolve_cursor_region`
/// and normalized to end with a newline (unless it is empty) before the patch
/// is applied to it.
///
/// `cursor_offset` is a byte offset that is added to the offset of the first
/// hunk in the patched text (or to 0 when no hunk offset is available). In the
/// fallback branch the resulting position is clamped to the text length and
/// then moved back to the nearest preceding UTF-8 character boundary, so a
/// slightly-off cursor never causes a panic.
///
/// # Errors
///
/// Propagates errors from `encode_patch_as_output_for_format`, from applying
/// the diff, and from the multi-region encoders.
pub fn format_expected_output(
    input: &ZetaPromptInput,
    format: ZetaFormat,
    patch: &str,
    cursor_offset: Option<usize>,
) -> Result<String> {
    let (context, editable_range, _, _) = resolve_cursor_region(input, format);
    let mut old_editable = context[editable_range].to_string();
    if !old_editable.is_empty() && !old_editable.ends_with('\n') {
        old_editable.push('\n');
    }

    // Formats with their own output encoding (hashline, variable-edit,
    // multi-region empty patches) are handled here.
    if let Some(output) =
        encode_patch_as_output_for_format(format, &old_editable, patch, cursor_offset)?
    {
        return Ok(output);
    }

    match format {
        // Multi-region formats: non-empty patches need diff application
        // then marker-span encoding.
        ZetaFormat::V0316SeedMultiRegions => {
            let (new_editable, first_hunk_offset) =
                udiff::apply_diff_to_string_with_hunk_offset(patch, &old_editable)?;
            let cursor_in_new = cursor_in_new_text(cursor_offset, first_hunk_offset, &new_editable);
            multi_region::encode_from_old_and_new_v0316(
                &old_editable,
                &new_editable,
                cursor_in_new,
                CURSOR_MARKER,
                multi_region::V0316_END_MARKER,
            )
        }
        ZetaFormat::V0317SeedMultiRegions => {
            let (new_editable, first_hunk_offset) =
                udiff::apply_diff_to_string_with_hunk_offset(patch, &old_editable)?;
            let cursor_in_new = cursor_in_new_text(cursor_offset, first_hunk_offset, &new_editable);
            multi_region::encode_from_old_and_new_v0317(
                &old_editable,
                &new_editable,
                cursor_in_new,
                CURSOR_MARKER,
                multi_region::V0317_END_MARKER,
            )
        }
        ZetaFormat::V0318SeedMultiRegions => {
            let (new_editable, first_hunk_offset) =
                udiff::apply_diff_to_string_with_hunk_offset(patch, &old_editable)?;
            let cursor_in_new = cursor_in_new_text(cursor_offset, first_hunk_offset, &new_editable);
            multi_region::encode_from_old_and_new_v0318(
                &old_editable,
                &new_editable,
                cursor_in_new,
                CURSOR_MARKER,
                multi_region::V0318_END_MARKER,
            )
        }
        // V0131-style formats and fallback: produce new editable text with
        // cursor marker inserted, followed by the end marker.
        _ => {
            // A patch of at most three lines is treated as empty: the
            // editable region is emitted unchanged.
            let empty_patch = patch.lines().count() <= 3;
            let (mut result, first_hunk_offset) = if empty_patch {
                (old_editable, None)
            } else {
                udiff::apply_diff_to_string_with_hunk_offset(patch, &old_editable)?
            };

            if let Some(cursor) = cursor_offset {
                // `first_hunk_offset` is `None` for empty patches, so the
                // cursor is then relative to the start of the region.
                let hunk_start = first_hunk_offset.unwrap_or(0);
                let mut offset = hunk_start.saturating_add(cursor).min(result.len());
                // `String::insert_str` panics when the index is not on a
                // char boundary; step back to the closest valid position.
                // Offset 0 is always a boundary, so this terminates.
                while !result.is_char_boundary(offset) {
                    offset -= 1;
                }
                result.insert_str(offset, CURSOR_MARKER);
            }

            if !result.is_empty() && !result.ends_with('\n') {
                result.push('\n');
            }

            if let Some(end_marker) = output_end_marker_for_format(format) {
                result.push_str(end_marker);
            }

            Ok(result)
        }
    }
}
/// Compute the cursor position within the new text after diff application.
///
/// The position is `first_hunk_offset` (0 when `None`) plus `cursor_offset`,
/// clamped to `new_text.len()`. Offsets are byte offsets; if the clamped
/// value falls inside a multi-byte UTF-8 character it is moved back to the
/// start of that character so the result is always a valid position in
/// `new_text`.
///
/// Returns `None` when `cursor_offset` is `None`.
fn cursor_in_new_text(
    cursor_offset: Option<usize>,
    first_hunk_offset: Option<usize>,
    new_text: &str,
) -> Option<usize> {
    let cursor = cursor_offset?;
    let hunk_start = first_hunk_offset.unwrap_or(0);
    // Saturate rather than overflow; the sum is clamped to the text length
    // right after anyway.
    let mut offset = hunk_start.saturating_add(cursor).min(new_text.len());
    // Offset 0 is always a char boundary, so this loop terminates.
    while !new_text.is_char_boundary(offset) {
        offset -= 1;
    }
    Some(offset)
}
pub struct ParsedOutput {
/// Text that should replace the editable region
pub new_editable_region: String,