#!/usr/bin/env python3
"""Machine audit for active release profile subsystem contracts."""
from __future__ import annotations

import argparse
import json
from pathlib import Path
import re
import sys
from typing import Any

from canonical_completion_guard import REPO_ROOT, subsystem_matches_path
from control_plane import DEFAULT_CONTROL_PLANE
from repo_file_io import load_repo_json
from subsystem_contracts import tracked_contract_files

CONTRACTS_DIR = DEFAULT_CONTROL_PLANE["subsystems_dir_path"]
REGISTRY_PATH = DEFAULT_CONTROL_PLANE["registry_path"]
STATUS_PATH = DEFAULT_CONTROL_PLANE["status_path"]
REGISTRY_REL = DEFAULT_CONTROL_PLANE["registry_rel"]
STATUS_REL = DEFAULT_CONTROL_PLANE["status_rel"]
TEMPLATE_REL = DEFAULT_CONTROL_PLANE["subsystem_contract_template_rel"]

REQUIRED_SECTIONS = [
"## Contract Metadata",
"## Purpose",
"## Canonical Files",
"## Shared Boundaries",
"## Extension Points",
"## Forbidden Paths",
"## Completion Obligations",
"## Current State",
]

LIST_SECTIONS = {
"## Canonical Files",
"## Shared Boundaries",
"## Extension Points",
"## Forbidden Paths",
"## Completion Obligations",
}

METADATA_REQUIRED_FIELDS = {
"subsystem_id",
"lane",
"contract_file",
"status_file",
"registry_file",
"dependency_subsystem_ids",
}

METADATA_REQUIRED_STRING_FIELDS = {
"subsystem_id",
"lane",
"contract_file",
"status_file",
"registry_file",
}

PATH_SUFFIXES = (
".go",
".json",
".md",
".mod",
".mjs",
".py",
".sh",
".sum",
".ts",
".tsx",
".yaml",
".yml",
)

WORKSPACE_REPOS_ROOT = REPO_ROOT.parent


def sorted_casefold(values: list[str]) -> list[str]:
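    """Sort strings case-insensitively, e.g. ["b", "A"] -> ["A", "b"]."""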
return sorted(values, key=lambda value: value.casefold())


def load_registry_payload(*, staged: bool = False) -> dict[str, Any]:
return load_repo_json(REGISTRY_PATH, staged=staged)


def load_status_payload(*, staged: bool = False) -> dict[str, Any]:
return load_repo_json(STATUS_PATH, staged=staged)


def section_body(lines: list[str], heading: str) -> list[str]:
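    """Return the lines between ``heading`` and the next "## " heading.

    Callers must pass a heading known to be present; otherwise ``next``
    raises StopIteration.
    """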
start = next(index for index, line in enumerate(lines) if line == heading) + 1
end = len(lines)
for index in range(start, len(lines)):
if lines[index].startswith("## "):
end = index
break
return lines[start:end]


def section_list_items(body_lines: list[str]) -> list[tuple[int, str]]:
    """Extract numbered-list items as (body offset, item text) pairs.

    Example (illustrative): ["1. First", "", "2. Second"] yields
    [(0, "First"), (2, "Second")].
    """
    items: list[tuple[int, str]] = []
    for index, line in enumerate(body_lines):
        stripped = line.strip()
        if not stripped:
            continue
        # A list item starts with a marker "1. " through "99. ".
        match = re.match(r"[1-9]\d?\. ", stripped)
        if match:
            items.append((index, stripped[match.end():]))
    return items


def looks_like_repo_path(token: str) -> bool:
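    """Heuristic: a token with a "/" or a known source-file suffix is a path."""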
candidate = token.strip()
return "/" in candidate or candidate.endswith(PATH_SUFFIXES)


def repo_roots_for_status(status_payload: dict[str, Any]) -> dict[str, Path]:
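    """Map repo ids from status "scope.active_repos" to filesystem roots.

    The current repo always maps to REPO_ROOT; other active repos resolve to
    siblings under WORKSPACE_REPOS_ROOT.
    """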
active_repos = [
repo_id
for repo_id in status_payload.get("scope", {}).get("active_repos", [])
if isinstance(repo_id, str) and repo_id.strip()
]
repo_roots = {REPO_ROOT.name: REPO_ROOT}
for repo_id in active_repos:
if repo_id == REPO_ROOT.name:
continue
repo_roots[repo_id] = WORKSPACE_REPOS_ROOT / repo_id
return repo_roots


def resolve_repo_path_token(token: str, *, repo_roots: dict[str, Path]) -> Path | None:
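    """Resolve a ``path`` or ``repo_id:path`` token against ``repo_roots``.

    Returns None when the token is empty, absolute, escapes upward via "..",
    is not in clean normalized form, or names an unknown repo.
    """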
    raw = token.rstrip("/")
if not raw:
return None
repo_id = REPO_ROOT.name
repo_rel = raw
if ":" in raw:
repo_id, repo_rel = raw.split(":", 1)
if not repo_id:
return None
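    # Reject tokens that are absolute, escape upward via "..", or do not
    # round-trip Path normalization; check both the raw token and the
    # repo-relative part after the optional "repo_id:" prefix.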
candidate = Path(raw)
normalized = candidate.as_posix()
if candidate.is_absolute() or raw.startswith("../") or "/../" in raw or normalized != raw:
return None
repo_candidate = Path(repo_rel)
repo_normalized = repo_candidate.as_posix()
if (
repo_candidate.is_absolute()
or repo_rel.startswith("../")
or "/../" in repo_rel
or repo_normalized != repo_rel
):
return None
repo_root = repo_roots.get(repo_id)
if repo_root is None:
return None
return repo_root / repo_rel


def validate_repo_path_token(
token: str,
*,
rel: str,
heading: str,
errors: list[str],
repo_roots: dict[str, Path],
) -> None:
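    """Append an audit error when ``token`` does not resolve cleanly, points
    at a missing path, or has a trailing slash but is not a directory."""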
resolved = resolve_repo_path_token(token, repo_roots=repo_roots)
if resolved is None:
errors.append(f"{rel} {heading} contains non-clean repo-relative path {token!r}")
return
if not resolved.exists():
errors.append(f"{rel} {heading} references missing path {token!r}")
return
if token.endswith("/") and not resolved.is_dir():
errors.append(f"{rel} {heading} expects directory path {token!r}")


def parse_contract_metadata(body_lines: list[str]) -> tuple[dict[str, Any] | None, list[str]]:
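    """Parse the fenced JSON block from the "## Contract Metadata" body.

    Returns (payload, errors); payload is None whenever the block is missing,
    malformed, or not a JSON object.
    """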
errors: list[str] = []
meaningful = [line for line in body_lines if line.strip()]
if len(meaningful) < 3:
return None, ["contract metadata section must contain a JSON fenced block"]
if meaningful[0].strip() != "```json":
errors.append("contract metadata section must start with ```json")
return None, errors
if meaningful[-1].strip() != "```":
errors.append("contract metadata section must end with ```")
return None, errors
json_block = "\n".join(meaningful[1:-1]).strip()
if not json_block:
errors.append("contract metadata JSON block must not be empty")
return None, errors
try:
payload = json.loads(json_block)
except json.JSONDecodeError as exc:
errors.append(f"contract metadata JSON is invalid: {exc}")
return None, errors
if not isinstance(payload, dict):
errors.append("contract metadata JSON must be an object")
return None, errors
return payload, errors


def audit_contract_text(
rel: str,
content: str,
*,
repo_roots: dict[str, Path],
) -> tuple[dict[str, Any], list[str]]:
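    """Structurally audit a single contract file.

    Returns a summary (title, metadata, path references, section list items)
    together with any structural errors found.
    """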
errors: list[str] = []
path_references: list[dict[str, str]] = []
section_items: dict[str, list[str]] = {}
lines = content.splitlines()
if not lines or not lines[0].startswith("# "):
errors.append(f"{rel} must start with a level-1 heading")
heading_positions: dict[str, int] = {}
for index, line in enumerate(lines):
if line in REQUIRED_SECTIONS:
if line in heading_positions:
errors.append(f"{rel} duplicates required section {line!r}")
heading_positions[line] = index
positions: list[int] = []
for heading in REQUIRED_SECTIONS:
if heading not in heading_positions:
errors.append(f"{rel} missing required section {heading!r}")
continue
positions.append(heading_positions[heading])
if positions and positions != sorted(positions):
errors.append(f"{rel} required sections must appear in canonical order")
metadata: dict[str, Any] | None = None
if "## Contract Metadata" in heading_positions:
metadata, metadata_errors = parse_contract_metadata(section_body(lines, "## Contract Metadata"))
errors.extend(f"{rel} {error}" for error in metadata_errors)
for heading in REQUIRED_SECTIONS:
if heading not in heading_positions or heading == "## Contract Metadata":
continue
body = section_body(lines, heading)
if not any(line.strip() for line in body):
errors.append(f"{rel} section {heading!r} must not be empty")
continue
if heading in LIST_SECTIONS:
items = section_list_items(body)
if not items:
errors.append(f"{rel} section {heading!r} must contain a numbered list")
continue
section_items[heading] = [item for _, item in items]
if heading == "## Canonical Files":
for _, item in items:
path_tokens = [token for token in re.findall(r"`([^`]+)`", item) if looks_like_repo_path(token)]
if not path_tokens:
errors.append(f"{rel} section {heading!r} entries must include at least one repo path")
continue
for token in path_tokens:
validate_repo_path_token(
token,
rel=rel,
heading=heading,
errors=errors,
repo_roots=repo_roots,
)
path_references.append({"heading": heading, "path": token})
if heading == "## Extension Points":
for _, item in items:
for token in re.findall(r"`([^`]+)`", item):
if looks_like_repo_path(token):
validate_repo_path_token(
token,
rel=rel,
heading=heading,
errors=errors,
repo_roots=repo_roots,
)
path_references.append({"heading": heading, "path": token})
return {
"title": lines[0].strip() if lines else "",
"metadata": metadata,
"path_references": path_references,
"section_items": section_items,
}, errors


def path_owner_ids(registry_subsystems: list[dict[str, Any]], path: str) -> list[str]:
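    """Ids of registered subsystems whose scope matches ``path``."""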
return [
str(subsystem.get("id", "")).strip()
for subsystem in registry_subsystems
if isinstance(subsystem, dict)
if subsystem_matches_path(subsystem, path)
]


def expected_shared_boundaries(
registry_payload: dict[str, Any],
subsystem_id: str,
) -> list[dict[str, Any]]:
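    """Registry ``shared_ownerships`` entries naming ``subsystem_id``, sorted by path."""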
shared_ownerships = registry_payload.get("shared_ownerships", [])
if not isinstance(shared_ownerships, list):
return []
entries = [
entry
for entry in shared_ownerships
if isinstance(entry, dict)
if subsystem_id in entry.get("subsystems", [])
]
return sorted(entries, key=lambda entry: str(entry.get("path", "")).casefold())


def render_shared_boundary_item(entry: dict[str, Any], subsystem_id: str) -> str:
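    """Render the canonical "## Shared Boundaries" line for a registry entry.

    Illustrative output (hypothetical ids):
        `pkg/shared/` shared with `other-subsystem`: rationale text.
    """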
path = str(entry.get("path", "")).strip()
rationale = str(entry.get("rationale", "")).strip()
partner_ids = sorted_casefold(
[
other_id
for other_id in entry.get("subsystems", [])
if isinstance(other_id, str) and other_id != subsystem_id
]
)
partner_clause = ", ".join(f"`{partner_id}`" for partner_id in partner_ids)
return f"`{path}` shared with {partner_clause}: {rationale}."


def audit_contract_payload(
*,
registry_payload: dict[str, Any],
status_payload: dict[str, Any],
contract_texts: dict[str, str],
) -> dict[str, Any]:
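    """Cross-check every contract against the registry, status lanes, and filesystem.

    Returns a report dict with "errors", "warnings", "summary", and "contracts".
    """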
errors: list[str] = []
warnings: list[str] = []
registry_subsystems = registry_payload.get("subsystems")
if not isinstance(registry_subsystems, list):
return {
"errors": ["registry.json missing subsystems list"],
"warnings": warnings,
"summary": {},
"contracts": [],
}
status_lanes = {
lane.get("id")
for lane in status_payload.get("lanes", [])
if isinstance(lane, dict) and isinstance(lane.get("id"), str)
}
expected_contracts = {
str(subsystem.get("contract")): subsystem
for subsystem in registry_subsystems
if isinstance(subsystem, dict) and isinstance(subsystem.get("contract"), str)
}
repo_roots = repo_roots_for_status(status_payload)
expected_subsystem_ids = {
str(subsystem.get("id", "")).strip()
for subsystem in registry_subsystems
if isinstance(subsystem, dict) and isinstance(subsystem.get("id"), str)
}
actual_contracts = {rel for rel in contract_texts if rel.endswith(".md") and rel != TEMPLATE_REL}
missing_contracts = sorted(set(expected_contracts) - actual_contracts)
extra_contracts = sorted(actual_contracts - set(expected_contracts))
for rel in missing_contracts:
errors.append(f"missing registered subsystem contract {rel}")
for rel in extra_contracts:
errors.append(f"unregistered subsystem contract present: {rel}")
contract_summaries: list[dict[str, Any]] = []
seen_subsystem_ids: set[str] = set()
for rel in sorted(actual_contracts):
parsed, parse_errors = audit_contract_text(rel, contract_texts[rel], repo_roots=repo_roots)
errors.extend(parse_errors)
metadata = parsed.get("metadata")
subsystem = expected_contracts.get(rel)
if metadata is None:
contract_summaries.append({"contract": rel})
continue
metadata_keys = set(metadata)
if metadata_keys != METADATA_REQUIRED_FIELDS:
errors.append(
f"{rel} contract metadata keys = {sorted(metadata_keys)!r}, want {sorted(METADATA_REQUIRED_FIELDS)!r}"
)
for field in sorted(METADATA_REQUIRED_STRING_FIELDS):
value = metadata.get(field)
if not isinstance(value, str) or not value.strip():
errors.append(f"{rel} contract metadata field {field!r} must be a non-empty string")
subsystem_id = str(metadata.get("subsystem_id", "")).strip()
lane = str(metadata.get("lane", "")).strip()
contract_file = str(metadata.get("contract_file", "")).strip()
status_file = str(metadata.get("status_file", "")).strip()
registry_file = str(metadata.get("registry_file", "")).strip()
dependency_subsystem_ids = metadata.get("dependency_subsystem_ids")
shared_boundary_items = list(parsed.get("section_items", {}).get("## Shared Boundaries", []))
normalized_dependencies: list[str] = []
if not isinstance(dependency_subsystem_ids, list):
errors.append(f"{rel} contract metadata field 'dependency_subsystem_ids' must be a list")
else:
if len(dependency_subsystem_ids) != len(set(dependency_subsystem_ids)):
errors.append(f"{rel} contract metadata dependency_subsystem_ids must not contain duplicates")
for index, dependency in enumerate(dependency_subsystem_ids):
if not isinstance(dependency, str) or not dependency.strip():
errors.append(
f"{rel} contract metadata dependency_subsystem_ids[{index}] must be a non-empty string"
)
continue
normalized_dependencies.append(dependency.strip())
if normalized_dependencies != sorted_casefold(normalized_dependencies):
errors.append(f"{rel} contract metadata dependency_subsystem_ids must be sorted lexicographically")
if subsystem_id:
if subsystem_id in seen_subsystem_ids:
errors.append(f"{rel} duplicates contract metadata subsystem_id {subsystem_id!r}")
seen_subsystem_ids.add(subsystem_id)
if contract_file and contract_file != rel:
errors.append(f"{rel} contract metadata contract_file = {contract_file!r}, want {rel!r}")
if status_file and status_file != STATUS_REL:
errors.append(f"{rel} contract metadata status_file must be {STATUS_REL!r}")
if registry_file and registry_file != REGISTRY_REL:
errors.append(f"{rel} contract metadata registry_file must be {REGISTRY_REL!r}")
if lane and lane not in status_lanes:
errors.append(f"{rel} contract metadata references unknown lane {lane!r}")
if subsystem_id and subsystem_id in normalized_dependencies:
errors.append(f"{rel} contract metadata must not declare self dependency {subsystem_id!r}")
for dependency in normalized_dependencies:
if dependency not in expected_subsystem_ids:
errors.append(f"{rel} contract metadata references unknown dependency subsystem {dependency!r}")
if subsystem is None:
errors.append(f"{rel} is not registered in registry.json")
else:
expected_id = str(subsystem.get("id", "")).strip()
expected_lane = str(subsystem.get("lane", "")).strip()
if subsystem_id and subsystem_id != expected_id:
errors.append(f"{rel} contract metadata subsystem_id = {subsystem_id!r}, want {expected_id!r}")
if lane and lane != expected_lane:
errors.append(f"{rel} contract metadata lane = {lane!r}, want {expected_lane!r}")
actual_dependency_ids: set[str] = set()
for reference in parsed.get("path_references", []):
heading = str(reference.get("heading", "")).strip()
path = str(reference.get("path", "")).strip()
if not heading or not path:
continue
owner_ids = path_owner_ids(registry_subsystems, path)
if subsystem_id and subsystem_id in owner_ids:
continue
if len(owner_ids) > 1:
errors.append(f"{rel} {heading} path {path!r} resolves to multiple subsystem owners {owner_ids!r}")
continue
if not owner_ids:
continue
owner_id = owner_ids[0]
actual_dependency_ids.add(owner_id)
expected_dependencies = sorted_casefold(list(actual_dependency_ids))
if normalized_dependencies != expected_dependencies:
errors.append(
f"{rel} contract metadata dependency_subsystem_ids = {normalized_dependencies!r}, want {expected_dependencies!r}"
)
expected_shared = expected_shared_boundaries(registry_payload, subsystem_id)
expected_shared_paths = [
str(entry.get("path", "")).strip()
for entry in expected_shared
if isinstance(entry.get("path"), str) and str(entry.get("path", "")).strip()
]
expected_shared_items = [render_shared_boundary_item(entry, subsystem_id) for entry in expected_shared]
if not expected_shared_paths:
if shared_boundary_items != ["None."]:
errors.append(
f"{rel} section '## Shared Boundaries' must contain exactly `1. None.` when no registry shared ownership entries exist"
)
else:
actual_shared_paths: list[str] = []
seen_shared_paths: set[str] = set()
for item in shared_boundary_items:
tokens = re.findall(r"`([^`]+)`", item)
path_tokens = [token for token in tokens if looks_like_repo_path(token)]
if len(path_tokens) != 1:
errors.append(
f"{rel} section '## Shared Boundaries' entries must include exactly one repo path"
)
continue
shared_path = path_tokens[0]
validate_repo_path_token(
shared_path,
rel=rel,
heading="## Shared Boundaries",
errors=errors,
repo_roots=repo_roots,
)
actual_shared_paths.append(shared_path)
if shared_path in seen_shared_paths:
errors.append(
f"{rel} section '## Shared Boundaries' must not repeat shared path {shared_path!r}"
)
continue
seen_shared_paths.add(shared_path)
expected_entry = next(
(entry for entry in expected_shared if entry.get("path") == shared_path),
None,
)
if expected_entry is None:
continue
partner_ids = [
other_id
for other_id in expected_entry.get("subsystems", [])
if isinstance(other_id, str) and other_id != subsystem_id
]
for partner_id in partner_ids:
if partner_id not in tokens:
errors.append(
f"{rel} shared boundary {shared_path!r} must mention partner subsystem {partner_id!r} in backticks"
)
if actual_shared_paths != expected_shared_paths:
errors.append(
f"{rel} section '## Shared Boundaries' paths = {actual_shared_paths!r}, want {expected_shared_paths!r}"
)
if shared_boundary_items != expected_shared_items:
errors.append(
f"{rel} section '## Shared Boundaries' items = {shared_boundary_items!r}, want {expected_shared_items!r}"
)
contract_summaries.append(
{
"contract": rel,
"subsystem_id": subsystem_id,
"lane": lane,
"dependency_subsystem_ids": normalized_dependencies,
"title": parsed.get("title", ""),
}
)
if seen_subsystem_ids != expected_subsystem_ids:
missing_ids = sorted(expected_subsystem_ids - seen_subsystem_ids)
extra_ids = sorted(seen_subsystem_ids - expected_subsystem_ids)
for subsystem_id in missing_ids:
errors.append(f"missing contract metadata for registered subsystem_id {subsystem_id!r}")
for subsystem_id in extra_ids:
errors.append(f"unregistered subsystem_id present in contract metadata: {subsystem_id!r}")
return {
"errors": errors,
"warnings": warnings,
"summary": {
"contract_count": len(contract_summaries),
},
"contracts": contract_summaries,
}


def parse_args(argv: list[str]) -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Audit active release profile subsystem contracts.")
parser.add_argument(
"--check",
action="store_true",
help="Exit non-zero if the contract audit finds any errors.",
)
parser.add_argument(
"--pretty",
action="store_true",
help="Print a concise human-readable summary instead of JSON.",
)
parser.add_argument(
"--staged",
action="store_true",
help="Read subsystem contracts from the git index instead of the working tree.",
)
return parser.parse_args(argv)


def render_pretty(report: dict[str, Any]) -> str:
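    """Render the report as a concise human-readable summary.

    Illustrative output (hypothetical values):
        summary: contracts=2
        some-id: lane=some-lane contract=subsystems/some-id.md
    """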
lines: list[str] = []
summary = report.get("summary", {})
if summary:
lines.append(f"summary: contracts={summary.get('contract_count', 0)}")
for contract in report.get("contracts", []):
lines.append(
f"{contract.get('subsystem_id', '?')}: lane={contract.get('lane', '?')} contract={contract.get('contract', '?')}"
)
if report.get("warnings"):
lines.append("warnings:")
for warning in report["warnings"]:
lines.append(f" - {warning}")
if report.get("errors"):
lines.append("errors:")
for err in report["errors"]:
lines.append(f" - {err}")
return "\n".join(lines)


def main(argv: list[str] | None = None) -> int:
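    # argv=None parses an empty argument list; the __main__ hook below passes
    # sys.argv[1:] explicitly.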
args = parse_args(list(argv or []))
report = audit_contract_payload(
registry_payload=load_registry_payload(staged=args.staged),
status_payload=load_status_payload(staged=args.staged),
contract_texts=tracked_contract_files(staged=args.staged),
)
output = render_pretty(report) if args.pretty else json.dumps(report, indent=2, sort_keys=True)
print(output)
if args.check and report["errors"]:
return 1
return 0
if __name__ == "__main__":
raise SystemExit(main(sys.argv[1:]))