unsloth/scripts/verify_comment_only_diff.py
Daniel Han 4192fe6ebe
studio: drop unused max_grad_value schema + route plumbing (#5424)
* studio: drop unused max_grad_value schema + route plumbing

The MLX worker hardcodes max_grad_value to 5.0 after PR #5340. The
schema field, frontend payload type, route forwarder, and start_training
kwarg threading were all left in place as a transitional buffer for old
clients. The field is now genuinely unused everywhere except inside the
MLX worker, so the schema, route forwarder, and config-build entries can
go. Pydantic still tolerates older clients that send max_grad_value
because TrainingStartRequest's model_config defaults to extra=ignore.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2026-05-14 05:43:58 -07:00

247 lines
8.3 KiB
Python

# Unsloth - 2x faster, 60% less VRAM LLM training and finetuning
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
"""Deterministic comment / docstring-only verifier.
Compares a list of changed files between two git refs and reports whether
each diff is strictly comments / docstrings (Python) or comments
(YAML / GitHub Actions). Useful for gating a "comment trim" /
"docstring refactor" PR against accidental code drift.
Per .py file: parse both revs into AST, strip module / class / function
docstrings, then compare ast.unparse output. Pure Python comments are
discarded by the parser by construction, so any post-strip diff is real
code. Per .yml file: yaml.safe_load both sides and compare the parsed
Python object; if scalar values differ, also strip shell comments inside
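``run: |`` block bodies before comparing. Exit code 0 = no checked file has
a real diff, 1 = at least one file has a real (non-comment) diff or fails
to parse. Files that git cannot read at either ref, and files that are not
.py / .yml / .yaml, are reported and skipped without affecting the exit code.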
Usage:
python scripts/verify_comment_only_diff.py [--base REF] [--head REF] path ...
Defaults: --base origin/main, --head HEAD. Paths are repo-relative.
Example:
git diff --name-only origin/main..HEAD \\
| xargs python scripts/verify_comment_only_diff.py --base origin/main
"""
from __future__ import annotations
import argparse
import ast
import difflib
import subprocess
import sys
from typing import Any
import yaml
def _git_show(rev: str, path: str) -> str:
    """Return the text of ``path`` as stored at git revision ``rev``.

    Runs ``git show <rev>:<path>`` with stderr discarded and stdout decoded
    as text.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    blob_spec = f"{rev}:{path}"
    completed = subprocess.run(
        ["git", "show", blob_spec],
        stdout = subprocess.PIPE,
        stderr = subprocess.DEVNULL,
        text = True,
        check = True,
    )
    return completed.stdout
def _strip_docstrings(tree: ast.AST) -> ast.AST:
    """Remove every docstring from ``tree`` in place and return ``tree``.

    A docstring is a leading string-literal expression statement in the body
    of a Module, FunctionDef, AsyncFunctionDef or ClassDef node.

    A function or class whose body becomes empty gets a single ``pass`` so
    that ``ast.unparse`` still emits valid source. A Module is allowed to end
    up with an empty body: that way a docstring-only module normalises to the
    same text as an empty (or comment-only) module, and adding a docstring to
    an empty ``__init__.py`` is not reported as a code change.
    """
    scopes = (ast.Module, ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
    for node in ast.walk(tree):
        if not isinstance(node, scopes):
            continue
        body = getattr(node, "body", None)
        if not body:
            continue
        first = body[0]
        is_docstring = (
            isinstance(first, ast.Expr)
            and isinstance(first.value, ast.Constant)
            and isinstance(first.value.value, str)
        )
        if not is_docstring:
            continue
        node.body = body[1:]
        # Only def / class bodies must be non-empty; an empty Module unparses
        # to "" exactly like a file that never had a docstring.
        if not node.body and not isinstance(node, ast.Module):
            node.body = [ast.Pass()]
    return tree
def _normalize_py(src: str) -> str:
    """Return ``src`` re-rendered from its AST with docstrings removed.

    Comments never reach the AST, so two sources that differ only in
    comments and docstrings produce the same string.

    Raises:
        SyntaxError: if ``src`` cannot be parsed.
    """
    return ast.unparse(_strip_docstrings(ast.parse(src)))
def _strip_shell_comments(s: str) -> str:
    """Strip comments from a shell snippet and collapse runs of blank lines.

    Lines whose first non-blank character is ``#`` are dropped. On the
    remaining lines a trailing comment (whitespace followed by ``#`` outside
    any quotes) is cut off together with the whitespace before it. A ``#``
    inside a quoted string is kept, so edits to quoted text are not mistaken
    for comment edits.

    Heuristic only: a line with an odd number of ``'`` or ``"`` characters
    (a string that may continue on another line) is left untouched.

    Returns:
        The cleaned snippet with leading / trailing whitespace stripped.
    """
    kept = []
    for line in s.splitlines():
        if line.lstrip().startswith("#"):
            continue
        quotes_balanced = (
            line.count("'") % 2 == 0 and line.count('"') % 2 == 0
        )
        if quotes_balanced:
            cut = _unquoted_comment_start(line)
            if cut >= 0:
                line = line[:cut].rstrip()
        kept.append(line)

    collapsed = []
    for line in kept:
        # Keep only the first blank line of each run.
        if not line.strip() and collapsed and not collapsed[-1].strip():
            continue
        collapsed.append(line)
    return "\n".join(collapsed).strip()


def _unquoted_comment_start(line: str) -> int:
    """Return the index of the whitespace char preceding the first ``#``
    that starts a word outside quotes in ``line``, or -1 if there is none."""
    quote = ""  # the quote character currently open, "" when outside quotes
    escaped = False  # previous char was a backslash escaping this one
    after_space = False  # previous char was unquoted, unescaped whitespace
    for pos, ch in enumerate(line):
        if escaped:
            escaped = False
            after_space = False
        elif ch == "\\" and quote != "'":
            # Backslash escapes the next char everywhere but in '...'.
            escaped = True
            after_space = False
        elif quote:
            if ch == quote:
                quote = ""
        elif ch in "'\"":
            quote = ch
            after_space = False
        elif ch == "#" and after_space:
            return pos - 1
        else:
            after_space = ch in " \t"
    return -1
def _normalize_yaml_run_strings(obj: Any) -> Any:
    """Return a copy of a parsed YAML object with shell comments removed
    from its multi-line strings.

    Dicts and lists are rebuilt recursively. Every string containing a
    newline (such as a ``run: |`` script body) is passed through
    ``_strip_shell_comments``, wherever it appears. Any other value is
    returned as-is.
    """
    if isinstance(obj, str):
        return _strip_shell_comments(obj) if "\n" in obj else obj
    if isinstance(obj, list):
        return [_normalize_yaml_run_strings(item) for item in obj]
    if isinstance(obj, dict):
        cleaned = {}
        for key, value in obj.items():
            cleaned[key] = _normalize_yaml_run_strings(value)
        return cleaned
    return obj
def _walk_yaml_diff(b: Any, a: Any, prefix: str = "") -> None:
    """Print a path-keyed report of where ``b`` (before) and ``a`` (after)
    differ.

    Dicts are compared key by key (keys ordered by their ``str`` form) and
    lists element by element over their common length. Every difference
    found is printed. Descent stops only where the two sides have different
    types. ``prefix`` is the path of the current node; the root prints
    as ``/``.
    """
    here = prefix or "/"

    if type(b) is not type(a):
        print(f" type-diff at {here}: {type(b).__name__} -> {type(a).__name__}")
        return

    if isinstance(b, dict):
        for key in sorted(set(b.keys()) | set(a.keys()), key = str):
            child = f"{prefix}/{key}"
            if key not in b:
                print(f" added key {child}")
            elif key not in a:
                print(f" removed key {child}")
            else:
                _walk_yaml_diff(b[key], a[key], child)
        return

    if isinstance(b, list):
        if len(b) != len(a):
            print(f" list len at {here}: {len(b)} -> {len(a)}")
        # zip stops at the shorter list; the length line above covers the rest.
        for index, pair in enumerate(zip(b, a)):
            _walk_yaml_diff(pair[0], pair[1], f"{prefix}[{index}]")
        return

    if b != a:
        # Truncate long scalars (e.g. whole script bodies) to 300 chars.
        before_repr, after_repr = repr(b)[:300], repr(a)[:300]
        print(f" scalar at {here}:")
        print(f" before: {before_repr}")
        print(f" after: {after_repr}")
def _verify_python(path: str, before: str, after: str) -> bool:
    """Check that two revisions of a Python file differ only in comments
    and docstrings.

    Both sources are normalised with ``_normalize_py`` and compared as
    text. Prints one ``OK`` / ``FAIL`` line for ``path``; on a mismatch it
    also prints at most the first 40 lines of a unified diff of the
    normalised sources.

    Returns:
        True when the normalised sources are identical, False when they
        differ or either revision fails to parse.
    """
    try:
        old_code, new_code = _normalize_py(before), _normalize_py(after)
    except SyntaxError as exc:
        print(f"FAIL {path}: SyntaxError parsing -- {exc}")
        return False

    if old_code != new_code:
        delta = difflib.unified_diff(
            old_code.splitlines(),
            new_code.splitlines(),
            fromfile = f"{path}@before",
            tofile = f"{path}@after",
            n = 2,
        )
        print(f"FAIL {path}: AST differs after docstring strip:")
        for shown, diff_line in enumerate(delta):
            if shown == 40:
                # Cap the report so one large drift does not flood the log.
                break
            print(f" {diff_line}")
        return False

    print(f"OK {path} (AST identical after docstring strip)")
    return True
def _verify_yaml(path: str, before: str, after: str) -> bool:
    """Check that two revisions of a YAML file differ only in comments.

    Both texts are parsed with ``yaml.safe_load``. If the parsed objects
    are equal the file passes. Otherwise shell comments are stripped from
    multi-line strings on both sides and the objects are compared again.
    Prints one ``OK`` / ``FAIL`` line for ``path``, followed by a path-keyed
    diff summary on failure.

    Returns:
        True when the objects match (directly or after stripping), False
        when they still differ or either revision fails to parse.
    """
    try:
        parsed = [yaml.safe_load(text) for text in (before, after)]
    except yaml.YAMLError as exc:
        print(f"FAIL {path}: YAML parse error -- {exc}")
        return False

    old_obj, new_obj = parsed
    if old_obj == new_obj:
        print(f"OK {path} (YAML parsed object identical)")
        return True

    # Second chance: the only change may be a comment inside a script body,
    # which YAML sees as part of a string scalar.
    old_clean = _normalize_yaml_run_strings(old_obj)
    new_clean = _normalize_yaml_run_strings(new_obj)
    if old_clean != new_clean:
        print(
            f"FAIL {path}: YAML parsed objects still differ after stripping "
            f"shell comments from `run:` bodies.",
        )
        _walk_yaml_diff(old_clean, new_clean)
        return False

    print(
        f"OK {path} (YAML parsed object identical after "
        f"stripping shell comments from run: bodies)",
    )
    return True
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: verify each given path between two refs.

    Args:
        argv: argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        0 when no checked file failed, 1 when at least one ``.py`` /
        ``.yml`` / ``.yaml`` file failed verification. Paths that git
        cannot show at either ref are printed as ``SKIP``, and paths with
        any other extension as ``NOTE``; neither affects the result.
    """
    parser = argparse.ArgumentParser(
        description = "Verify each path's diff between BASE and HEAD is "
        "strictly comments / docstrings.",
    )
    parser.add_argument("--base", default = "origin/main", help = "base git ref")
    parser.add_argument("--head", default = "HEAD", help = "head git ref")
    parser.add_argument("paths", nargs = "+", help = "repo-relative paths")
    opts = parser.parse_args(argv)

    print(f"Comparing {len(opts.paths)} files: {opts.base} vs {opts.head}\n")

    any_failed = False
    for path in opts.paths:
        try:
            before = _git_show(opts.base, path)
            after = _git_show(opts.head, path)
        except subprocess.CalledProcessError as exc:
            print(f"SKIP {path}: {exc}")
            continue

        # Pick the checker by file extension.
        if path.endswith(".py"):
            checker = _verify_python
        elif path.endswith((".yml", ".yaml")):
            checker = _verify_yaml
        else:
            print(f"NOTE {path}: not .py or .yaml -- skipped automated check.")
            continue

        if not checker(path, before, after):
            any_failed = True

    return 1 if any_failed else 0
# Script entry point: exit with main()'s return value as the process status.
if __name__ == "__main__":
    raise SystemExit(main())