mirror of
https://github.com/zed-industries/zed.git
synced 2026-05-30 20:24:08 +00:00
ep: Move pure diffing functions to zeta_prompt::udiff (#52959)
This PR mostly moves some code around. It also adds a high-level `format_expected_output` function that routes patch formatting to specific prompt formats. This way, `zeta_prompt` can format `expected_output` for training. Keeping everything prompt-related in a "pure" module (with no heavy dependencies) makes it easier to write bindings. Release Notes: - N/A
This commit is contained in:
parent
f8d646794d
commit
66ea4b89af
6 changed files with 1327 additions and 1146 deletions
File diff suppressed because it is too large
Load diff
|
|
@ -6,11 +6,11 @@ use crate::{
|
|||
retrieve_context::run_context_retrieval,
|
||||
};
|
||||
use anyhow::{Context as _, Result, anyhow};
|
||||
use edit_prediction::udiff;
|
||||
use gpui::AsyncApp;
|
||||
use similar::DiffableStr;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use zeta_prompt::udiff;
|
||||
use zeta_prompt::{
|
||||
ZetaFormat, encode_patch_as_output_for_format, excerpt_range_for_format, format_zeta_prompt,
|
||||
multi_region, output_end_marker_for_format, resolve_cursor_region,
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@ use std::ops::Range;
|
|||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use edit_prediction::udiff::apply_diff_to_string;
|
||||
use language::{char_diff, text_diff};
|
||||
use zeta_prompt::udiff::apply_diff_to_string;
|
||||
|
||||
use zeta_prompt::ZetaPromptInput;
|
||||
|
||||
|
|
@ -653,9 +653,9 @@ pub fn compute_prediction_reversal_ratio(
|
|||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use edit_prediction::udiff::apply_diff_to_string;
|
||||
use indoc::indoc;
|
||||
use zeta_prompt::ExcerptRanges;
|
||||
use zeta_prompt::udiff::apply_diff_to_string;
|
||||
|
||||
fn make_test_prompt_inputs(
|
||||
content: &str,
|
||||
|
|
|
|||
|
|
@ -10,13 +10,13 @@ use crate::{
|
|||
reversal_tracking,
|
||||
};
|
||||
use anyhow::Context as _;
|
||||
use edit_prediction::udiff::{apply_diff_to_string, apply_diff_to_string_with_hunk_offset};
|
||||
use gpui::AsyncApp;
|
||||
use serde::Serialize;
|
||||
use std::fs::File;
|
||||
use std::io::BufWriter;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use zeta_prompt::udiff::{apply_diff_to_string, apply_diff_to_string_with_hunk_offset};
|
||||
|
||||
pub async fn run_scoring(
|
||||
example: &mut Example,
|
||||
|
|
|
|||
1206
crates/zeta_prompt/src/udiff.rs
Normal file
1206
crates/zeta_prompt/src/udiff.rs
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -1,5 +1,6 @@
|
|||
pub mod excerpt_ranges;
|
||||
pub mod multi_region;
|
||||
pub mod udiff;
|
||||
|
||||
use anyhow::{Result, anyhow};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
|
@ -819,6 +820,113 @@ pub fn encode_patch_as_output_for_format(
|
|||
}
|
||||
}
|
||||
|
||||
/// Given a `ZetaPromptInput`, a format, and a patch (with cursor already
|
||||
/// extracted), produce the expected model output string for training.
|
||||
pub fn format_expected_output(
|
||||
input: &ZetaPromptInput,
|
||||
format: ZetaFormat,
|
||||
patch: &str,
|
||||
cursor_offset: Option<usize>,
|
||||
) -> Result<String> {
|
||||
let (context, editable_range, _, _) = resolve_cursor_region(input, format);
|
||||
let mut old_editable = context[editable_range].to_string();
|
||||
if !old_editable.is_empty() && !old_editable.ends_with('\n') {
|
||||
old_editable.push('\n');
|
||||
}
|
||||
|
||||
// Formats with their own output encoding (hashline, variable-edit,
|
||||
// multi-region empty patches) are handled here.
|
||||
if let Some(output) =
|
||||
encode_patch_as_output_for_format(format, &old_editable, patch, cursor_offset)?
|
||||
{
|
||||
return Ok(output);
|
||||
}
|
||||
|
||||
let empty_patch = patch.lines().count() <= 3;
|
||||
|
||||
match format {
|
||||
// Multi-region formats: non-empty patches need diff application
|
||||
// then marker-span encoding.
|
||||
ZetaFormat::V0316SeedMultiRegions => {
|
||||
let (new_editable, first_hunk_offset) =
|
||||
udiff::apply_diff_to_string_with_hunk_offset(patch, &old_editable)?;
|
||||
let cursor_in_new = cursor_in_new_text(cursor_offset, first_hunk_offset, &new_editable);
|
||||
multi_region::encode_from_old_and_new_v0316(
|
||||
&old_editable,
|
||||
&new_editable,
|
||||
cursor_in_new,
|
||||
CURSOR_MARKER,
|
||||
multi_region::V0316_END_MARKER,
|
||||
)
|
||||
}
|
||||
ZetaFormat::V0318SeedMultiRegions => {
|
||||
let (new_editable, first_hunk_offset) =
|
||||
udiff::apply_diff_to_string_with_hunk_offset(patch, &old_editable)?;
|
||||
let cursor_in_new = cursor_in_new_text(cursor_offset, first_hunk_offset, &new_editable);
|
||||
multi_region::encode_from_old_and_new_v0318(
|
||||
&old_editable,
|
||||
&new_editable,
|
||||
cursor_in_new,
|
||||
CURSOR_MARKER,
|
||||
multi_region::V0318_END_MARKER,
|
||||
)
|
||||
}
|
||||
ZetaFormat::V0317SeedMultiRegions => {
|
||||
let (new_editable, first_hunk_offset) =
|
||||
udiff::apply_diff_to_string_with_hunk_offset(patch, &old_editable)?;
|
||||
let cursor_in_new = cursor_in_new_text(cursor_offset, first_hunk_offset, &new_editable);
|
||||
multi_region::encode_from_old_and_new_v0317(
|
||||
&old_editable,
|
||||
&new_editable,
|
||||
cursor_in_new,
|
||||
CURSOR_MARKER,
|
||||
multi_region::V0317_END_MARKER,
|
||||
)
|
||||
}
|
||||
// V0131-style formats and fallback: produce new editable text with
|
||||
// cursor marker inserted, followed by the end marker.
|
||||
_ => {
|
||||
let (mut result, first_hunk_offset) = if empty_patch {
|
||||
(old_editable.clone(), None)
|
||||
} else {
|
||||
udiff::apply_diff_to_string_with_hunk_offset(patch, &old_editable)?
|
||||
};
|
||||
|
||||
if let Some(cursor) = cursor_offset {
|
||||
let hunk_start = if !empty_patch {
|
||||
first_hunk_offset.unwrap_or(0)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
let offset = (hunk_start + cursor).min(result.len());
|
||||
result.insert_str(offset, CURSOR_MARKER);
|
||||
}
|
||||
|
||||
if !result.is_empty() && !result.ends_with('\n') {
|
||||
result.push('\n');
|
||||
}
|
||||
|
||||
if let Some(end_marker) = output_end_marker_for_format(format) {
|
||||
result.push_str(end_marker);
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the cursor position within the new text after diff application.
///
/// Returns `None` when `cursor_offset` is `None`. Otherwise the cursor offset
/// is added to `first_hunk_offset` (treated as `0` when absent) and the sum
/// is clamped to `new_text.len()`. The result is always a valid char boundary
/// of `new_text`: an offset that would fall inside a multi-byte character is
/// moved back to the start of that character.
fn cursor_in_new_text(
    cursor_offset: Option<usize>,
    first_hunk_offset: Option<usize>,
    new_text: &str,
) -> Option<usize> {
    let cursor = cursor_offset?;
    let hunk_start = first_hunk_offset.unwrap_or(0);

    // Saturate instead of overflowing: the sum is clamped to the text length
    // anyway, so an overflowing sum simply means "end of text".
    let mut offset = hunk_start.saturating_add(cursor).min(new_text.len());

    // Offset 0 is always a char boundary, so this loop terminates.
    while !new_text.is_char_boundary(offset) {
        offset -= 1;
    }

    Some(offset)
}
|
||||
|
||||
pub struct ParsedOutput {
|
||||
/// Text that should replace the editable region
|
||||
pub new_editable_region: String,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue