From d066a736097666dbbb3dafeca4801088cff92010 Mon Sep 17 00:00:00 2001 From: Oleksiy Syvokon Date: Sun, 24 May 2026 21:00:41 +0300 Subject: [PATCH] ep: Avoid service files as a split point (#57603) Make up to ten retries to avoid placing cursor on files that are typically machine generated (like Cargo.lock) Release Notes: - N/A --- .../edit_prediction_cli/src/split_commit.rs | 125 +++++++++++++++++- 1 file changed, 124 insertions(+), 1 deletion(-) diff --git a/crates/edit_prediction_cli/src/split_commit.rs b/crates/edit_prediction_cli/src/split_commit.rs index 20c23d7a023..df109437586 100644 --- a/crates/edit_prediction_cli/src/split_commit.rs +++ b/crates/edit_prediction_cli/src/split_commit.rs @@ -35,6 +35,8 @@ use std::io::{self, Write}; use std::path::Path; use std::path::PathBuf; +const MAX_SPLIT_POINT_SAMPLING_ATTEMPTS: usize = 10; + /// `ep split-commit` CLI args. #[derive(Debug, Args, Clone)] pub struct SplitCommitArgs { @@ -112,6 +114,89 @@ fn parse_split_point(value: &str) -> Option { } } +fn is_service_file(path: &str) -> bool { + let path = path.trim(); + let path = path + .strip_prefix("a/") + .or_else(|| path.strip_prefix("b/")) + .unwrap_or(path) + .trim_start_matches("./"); + + if path.is_empty() || path == "/dev/null" { + return true; + } + + let file_name = path.rsplit('/').next().unwrap_or(path); + if matches!( + file_name, + "package.json" + | "package-lock.json" + | "pnpm-lock.yaml" + | "Cargo.lock" + | "yarn.lock" + | "bun.lock" + | "bun.lockb" + | "go.sum" + | "composer.lock" + | "Gemfile.lock" + | "Pipfile.lock" + | "poetry.lock" + | "uv.lock" + | ".gitlab-ci.yml" + | ".travis.yml" + | "azure-pipelines.yml" + | "Jenkinsfile" + ) { + return true; + } + + if file_name.ends_with(".min.js") + || file_name.ends_with(".bundle.js") + || file_name.contains(".generated.") + || file_name.ends_with(".pb.go") + { + return true; + } + + if path == ".github/workflows" + || path.starts_with(".github/workflows/") + || path == ".circleci" + || path.starts_with(".circleci/") + { + return true; + } + + path.split('/').any(|component| { + matches!( + component, + "dist" | "build" | "coverage" | "node_modules" | "vendor" + ) + }) +} + +fn edit_starts_on_service_file(patch: &Patch, split_pos: usize) -> bool { + locate_edited_line(patch, split_pos as isize) + .is_some_and(|edit_location| is_service_file(&edit_location.filename)) +} + +fn sample_split_point(patch: &Patch, rng: &mut dyn rand::RngCore) -> usize { + let stats = patch.stats(); + let num_edits = stats.added + stats.removed; + if num_edits == 0 { + return 0; + } + + let mut split = rng.random_range(1..=num_edits); + for _ in 1..MAX_SPLIT_POINT_SAMPLING_ATTEMPTS { + if !edit_starts_on_service_file(patch, split) { + break; + } + split = rng.random_range(1..=num_edits); + } + + split +} + /// Entry point for the `ep split-commit` subcommand. /// /// This runs synchronously and outputs JSON Lines (one output per input line). @@ -303,7 +388,7 @@ pub fn generate_evaluation_example_from_ordered_commit( anyhow::ensure!(num_edits != 0, "no edits found in commit"); let split = match split_point { - None => rng.random_range(1..=num_edits), + None => sample_split_point(&patch, rng.as_mut()), Some(SplitPoint::Fraction(f)) => { let v = (f * num_edits as f64).floor() as usize; v.min(num_edits) @@ -1573,6 +1658,44 @@ index 123..456 789 assert!(!case.edit_history.contains("Date:")); } + #[test] + fn test_service_file_detection() { + assert!(is_service_file("package.json")); + assert!(is_service_file("frontend/yarn.lock")); + assert!(is_service_file("a/src/generated/types.pb.go")); + assert!(is_service_file("b/.github/workflows/ci.yml")); + assert!(is_service_file("web/node_modules/pkg/index.js")); + assert!(is_service_file("dist/app.bundle.js")); + + assert!(!is_service_file("src/main.rs")); + assert!(!is_service_file("src/build.rs")); + assert!(!is_service_file("Cargo.toml")); + } + + #[test] + fn test_edit_starts_on_service_file() { + let commit = r#"--- a/src/lib.rs ++++ b/src/lib.rs +@@ -1,1 +1,2 @@ + fn lib() {} ++pub fn added() {} +--- a/package-lock.json ++++ b/package-lock.json +@@ -1,1 +1,2 @@ + {} ++{"lockfileVersion": 3} +--- a/src/main.rs ++++ b/src/main.rs +@@ -1,1 +1,2 @@ + fn main() {} ++println!("hello"); +"#; + let patch = Patch::parse_unified_diff(commit); + + assert!(edit_starts_on_service_file(&patch, 1)); + assert!(!edit_starts_on_service_file(&patch, 2)); + } + #[test] fn test_position_weight() { // High weight positions (natural pause points)