unsloth/scripts/verify_comment_only_diff.py

# Unsloth - 2x faster, 60% less VRAM LLM training and finetuning
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.

"""Deterministic comment / docstring-only verifier.

Compares a list of changed files between two git refs and reports whether
each diff is strictly comments / docstrings (Python) or comments
(YAML / GitHub Actions). Useful for gating a "comment trim" /
"docstring refactor" PR against accidental code drift.

Per .py file: parse both revs into AST, strip module / class / function
docstrings, then compare ast.unparse output. Pure Python comments are
discarded by the parser by construction, so any post-strip diff is real
code. Per .yml file: yaml.safe_load both sides and compare the parsed
Python object; if scalar values differ, also strip shell comments inside
``run: |`` block bodies before comparing. Exit code 0 = all OK, 1 = at
least one file has a real (non-comment) diff or an error.

Usage:
    python scripts/verify_comment_only_diff.py [--base REF] [--head REF] path ...

Defaults: --base origin/main, --head HEAD. Paths are repo-relative.

Example:
    git diff --name-only origin/main..HEAD \\
      | xargs python scripts/verify_comment_only_diff.py --base origin/main
"""

from __future__ import annotations

import argparse
import ast
import difflib
import subprocess
import sys
from typing import Any

import yaml


def _git_show(rev: str, path: str) -> str:
    return subprocess.check_output(
        ["git", "show", f"{rev}:{path}"],
        text = True,
        stderr = subprocess.DEVNULL,
    )


def _strip_docstrings(tree: ast.AST) -> ast.AST:
    """Remove every string-literal docstring (Module / FunctionDef /
    AsyncFunctionDef / ClassDef). Empty body becomes ``pass`` so
    ast.unparse stays valid."""
    for node in ast.walk(tree):
        if isinstance(
            node,
            (ast.Module, ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef),
        ):
            body = getattr(node, "body", None)
            if not body:
                continue
            first = body[0]
            if (
                isinstance(first, ast.Expr)
                and isinstance(first.value, ast.Constant)
                and isinstance(first.value.value, str)
            ):
                node.body = body[1:]
                if not node.body:
                    node.body = [ast.Pass()]
    return tree


def _normalize_py(src: str) -> str:
    tree = ast.parse(src)
    tree = _strip_docstrings(tree)
    return ast.unparse(tree)


def _strip_shell_comments(s: str) -> str:
    """Strip pure-comment lines and inline trailing comments from a shell
    snippet, then collapse runs of blank lines. Heuristic only: leaves a
    line untouched if it has an odd quote count (open string)."""
    out = []
    for line in s.splitlines():
        stripped = line.lstrip()
        if stripped.startswith("#"):
            continue
        has_single = line.count("'") % 2 == 0
        has_double = line.count('"') % 2 == 0
        if has_single and has_double:
            idx = line.find(" #")
            if idx >= 0:
                line = line[:idx].rstrip()
        out.append(line)
    norm = []
    prev_blank = False
    for line in out:
        if line.strip() == "":
            if prev_blank:
                continue
            prev_blank = True
        else:
            prev_blank = False
        norm.append(line)
    return "\n".join(norm).strip()


def _normalize_yaml_run_strings(obj: Any) -> Any:
    """Walk the parsed YAML object; for any multi-line string (i.e. a
    ``run: |`` script body), strip shell comments. Returns a normalised
    copy."""
    if isinstance(obj, dict):
        return {k: _normalize_yaml_run_strings(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [_normalize_yaml_run_strings(x) for x in obj]
    if isinstance(obj, str) and "\n" in obj:
        return _strip_shell_comments(obj)
    return obj


def _walk_yaml_diff(b: Any, a: Any, prefix: str = "") -> None:
    """Print a path-keyed summary of the first structural / scalar diff."""
    if type(b) is not type(a):
        print(
            f"     type-diff at {prefix or '/'}: "
            f"{type(b).__name__} -> {type(a).__name__}",
        )
        return
    if isinstance(b, dict):
        keys = sorted((set(b.keys()) | set(a.keys())), key = lambda x: str(x))
        for k in keys:
            if k not in b:
                print(f"     added key {prefix}/{k}")
            elif k not in a:
                print(f"     removed key {prefix}/{k}")
            else:
                _walk_yaml_diff(b[k], a[k], f"{prefix}/{k}")
    elif isinstance(b, list):
        if len(b) != len(a):
            print(
                f"     list len at {prefix or '/'}: " f"{len(b)} -> {len(a)}",
            )
        for i, (bi, ai) in enumerate(zip(b, a)):
            _walk_yaml_diff(bi, ai, f"{prefix}[{i}]")
    elif b != a:
        bs = repr(b)[:300]
        as_ = repr(a)[:300]
        print(f"     scalar at {prefix or '/'}:")
        print(f"       before: {bs}")
        print(f"       after:  {as_}")


def _verify_python(path: str, before: str, after: str) -> bool:
    try:
        norm_before = _normalize_py(before)
        norm_after = _normalize_py(after)
    except SyntaxError as exc:
        print(f"FAIL {path}: SyntaxError parsing -- {exc}")
        return False
    if norm_before == norm_after:
        print(f"OK   {path}  (AST identical after docstring strip)")
        return True
    diff = list(
        difflib.unified_diff(
            norm_before.splitlines(),
            norm_after.splitlines(),
            fromfile = f"{path}@before",
            tofile = f"{path}@after",
            n = 2,
        )
    )
    print(f"FAIL {path}: AST differs after docstring strip:")
    for line in diff[:40]:
        print(f"     {line}")
    return False


def _verify_yaml(path: str, before: str, after: str) -> bool:
    try:
        raw_before = yaml.safe_load(before)
        raw_after = yaml.safe_load(after)
    except yaml.YAMLError as exc:
        print(f"FAIL {path}: YAML parse error -- {exc}")
        return False
    if raw_before == raw_after:
        print(f"OK   {path}  (YAML parsed object identical)")
        return True
    norm_before = _normalize_yaml_run_strings(raw_before)
    norm_after = _normalize_yaml_run_strings(raw_after)
    if norm_before == norm_after:
        print(
            f"OK   {path}  (YAML parsed object identical after "
            f"stripping shell comments from run: bodies)",
        )
        return True
    print(
        f"FAIL {path}: YAML parsed objects still differ after stripping "
        f"shell comments from `run:` bodies.",
    )
    _walk_yaml_diff(norm_before, norm_after)
    return False


def main(argv: list[str] | None = None) -> int:
    parser = argparse.ArgumentParser(
        description = "Verify each path's diff between BASE and HEAD is "
        "strictly comments / docstrings.",
    )
    parser.add_argument("--base", default = "origin/main", help = "base git ref")
    parser.add_argument("--head", default = "HEAD", help = "head git ref")
    parser.add_argument("paths", nargs = "+", help = "repo-relative paths")
    args = parser.parse_args(argv)

    rc = 0
    print(f"Comparing {len(args.paths)} files: {args.base} vs {args.head}\n")
    for path in args.paths:
        try:
            before = _git_show(args.base, path)
            after = _git_show(args.head, path)
        except subprocess.CalledProcessError as exc:
            print(f"SKIP {path}: {exc}")
            continue

        if path.endswith(".py"):
            if not _verify_python(path, before, after):
                rc = 1
        elif path.endswith((".yml", ".yaml")):
            if not _verify_yaml(path, before, after):
                rc = 1
        else:
            print(f"NOTE {path}: not .py or .yaml -- skipped automated check.")

    return rc


if __name__ == "__main__":
    sys.exit(main())