Pulse/scripts/release_control/registry_audit.py
2026-03-28 18:07:44 +00:00

656 lines
28 KiB
Python

#!/usr/bin/env python3
"""Machine audit for the active release profile subsystem registry."""
from __future__ import annotations
import argparse
import json
import os
from pathlib import Path
import re
import subprocess
import sys
from typing import Any
from canonical_completion_guard import (
REPO_ROOT,
is_ignored_runtime_file,
is_test_or_fixture,
path_policy_matches,
subsystem_matches_path,
)
from control_plane import DEFAULT_CONTROL_PLANE
from repo_file_io import load_repo_json
from status_audit import load_status_payload
# Locations of the registry document and of its JSON schema, taken from the
# default control-plane configuration and handed to load_repo_json below.
REGISTRY_PATH = DEFAULT_CONTROL_PLANE["registry_path"]
REGISTRY_SCHEMA_PATH = DEFAULT_CONTROL_PLANE["registry_schema_path"]
# Lane ids accepted by the audit: the letter "L" followed by one or more digits.
LANE_RE = re.compile(r"^L[0-9]+$")
def load_registry_payload(*, staged: bool = False) -> dict[str, Any]:
    """Load the registry document found at ``REGISTRY_PATH``.

    ``staged`` is forwarded unchanged to ``load_repo_json``.
    """
    payload = load_repo_json(REGISTRY_PATH, staged=staged)
    return payload
def load_registry_schema(*, staged: bool = False) -> dict[str, Any]:
    """Load the registry JSON schema found at ``REGISTRY_SCHEMA_PATH``.

    ``staged`` is forwarded unchanged to ``load_repo_json``.
    """
    schema = load_repo_json(REGISTRY_SCHEMA_PATH, staged=staged)
    return schema
def schema_required(schema: dict[str, Any], definition: str | None = None) -> set[str]:
    """Return the ``required`` field names of a schema node as a set.

    With ``definition`` left as ``None`` the top-level ``required`` list is
    used; otherwise the list under ``schema["$defs"][definition]``.
    A missing key raises ``KeyError``.
    """
    if definition is None:
        return set(schema["required"])
    return set(schema["$defs"][definition]["required"])
def registry_schema_contract(*, staged: bool = False) -> dict[str, Any]:
    """Load the registry schema and bundle it with its required-field sets.

    The result holds the schema itself under ``"schema"`` plus one
    ``required_*_fields`` set for the top level and for each of the
    ``subsystem``, ``verification``, ``path_policy`` and ``shared_ownership``
    definitions.
    """
    schema = load_registry_schema(staged=staged)
    contract: dict[str, Any] = {
        "schema": schema,
        "required_top_level_fields": schema_required(schema),
    }
    definition_keys = (
        ("required_subsystem_fields", "subsystem"),
        ("required_verification_fields", "verification"),
        ("required_path_policy_fields", "path_policy"),
        ("required_shared_ownership_fields", "shared_ownership"),
    )
    for contract_key, definition in definition_keys:
        contract[contract_key] = schema_required(schema, definition)
    return contract
# Built once when this module is imported (with staged=False), so importing the
# module loads the schema. audit_registry_payload falls back to this contract
# when its caller does not pass schema_contract.
DEFAULT_REGISTRY_SCHEMA_CONTRACT = registry_schema_contract()
def sorted_casefold(values: list[str]) -> list[str]:
    """Return a new list with *values* ordered by their casefolded form.

    The input list is left untouched; entries that compare equal after
    casefolding keep their original relative order.
    """
    ordered = list(values)
    ordered.sort(key=lambda item: item.casefold())
    return ordered
def tracked_repo_files() -> set[str]:
    """Return the tracked files of the local repository only."""
    local_name = REPO_ROOT.name
    return tracked_workspace_files(active_repos=[local_name], local_repo=local_name)
def tracked_workspace_files(*, active_repos: list[str], local_repo: str) -> set[str]:
    """Collect the git-tracked files of every active repo that exists on disk.

    Paths from ``local_repo`` are kept as reported by ``git ls-files``; paths
    from any other repo are prefixed with ``"<repo_id>:"``. Other repos are
    looked up as siblings of ``REPO_ROOT``; a repo whose directory is missing
    is skipped. Files sitting directly inside the control-plane
    ``subsystems_dir_path`` directory are added as well.

    Raises ``subprocess.CalledProcessError`` when ``git ls-files`` fails.
    """
    collected: set[str] = set()
    workspace_root = REPO_ROOT.parent
    for repo_id in active_repos:
        is_local = repo_id == local_repo
        checkout = REPO_ROOT if is_local else workspace_root / repo_id
        if not checkout.exists():
            continue
        git_env = os.environ.copy()
        if checkout != REPO_ROOT:
            # Presumably an inherited index override is only meant for the
            # local repo; it is dropped before querying any other checkout.
            git_env.pop("GIT_INDEX_FILE", None)
        listing = subprocess.run(
            ["git", "ls-files", "-z"],
            cwd=checkout,
            check=True,
            capture_output=True,
            text=False,
            env=git_env,
        )
        # -z output is NUL-separated; empty chunks (e.g. after the last NUL)
        # carry no path.
        for raw_entry in listing.stdout.split(b"\x00"):
            if raw_entry:
                rel = raw_entry.decode("utf-8")
                collected.add(rel if is_local else f"{repo_id}:{rel}")
    # Governance files are filesystem-only (gitignored). Add whatever files
    # exist on disk under the control-plane contracts directory so contract
    # path references in the registry resolve correctly.
    contracts_dir = Path(DEFAULT_CONTROL_PLANE["subsystems_dir_path"])
    if contracts_dir.exists():
        collected.update(
            child.relative_to(REPO_ROOT).as_posix()
            for child in contracts_dir.iterdir()
            if child.is_file()
        )
    return collected
def clean_relative_path(path: str) -> str:
    """Return *path* as rendered by ``pathlib.Path`` with forward slashes."""
    as_path = Path(path)
    return as_path.as_posix()
def validate_path_reference(
    path: str,
    *,
    context: str,
    errors: list[str],
    tracked_files: set[str],
    require_file: bool = True,
) -> None:
    """Record an error when *path* is not a clean, tracked repo-relative path.

    A path is clean when ``pathlib`` renders it back unchanged, it is not
    absolute, and none of its components is ``".."``. When the path is clean
    and ``require_file`` is true it must also be a member of ``tracked_files``.

    Problems are appended to ``errors`` prefixed with ``context``; at most one
    error is added per call and nothing is returned.
    """
    candidate = Path(path)
    # Checking the components catches every parent reference, including a bare
    # ".." and a trailing "/..", which a "../" / "/../" substring test misses.
    is_clean = (
        candidate.as_posix() == path
        and not candidate.is_absolute()
        and ".." not in candidate.parts
    )
    if not is_clean:
        errors.append(f"{context} must be a clean repo-relative path: {path!r}")
        return
    if require_file and path not in tracked_files:
        errors.append(f"{context} missing tracked file {path!r}")
def validate_prefix(
    prefix: str,
    *,
    context: str,
    errors: list[str],
    tracked_files: set[str],
) -> None:
    """Record an error when *prefix* is not a clean prefix of a tracked file.

    Trailing slashes are ignored for the cleanliness check: what remains must
    be rendered back unchanged by ``pathlib``, must not be absolute, and must
    not contain a ``".."`` component. A clean prefix must then be the start of
    at least one entry in ``tracked_files`` (compared with its trailing slash
    intact).

    Problems are appended to ``errors`` prefixed with ``context``; at most one
    error is added per call and nothing is returned.
    """
    raw = prefix.rstrip("/")
    candidate = Path(raw)
    # Checking the components catches every parent reference, including a bare
    # ".." and a trailing "/..", which a "../" / "/../" substring test misses.
    is_clean = (
        candidate.as_posix() == raw
        and not candidate.is_absolute()
        and ".." not in candidate.parts
    )
    if not is_clean:
        errors.append(f"{context} must be a clean repo-relative prefix: {prefix!r}")
        return
    if not any(tracked.startswith(prefix) for tracked in tracked_files):
        errors.append(f"{context} does not match any tracked files: {prefix!r}")
def owned_runtime_files(rule: dict[str, Any], tracked_files: set[str]) -> list[str]:
    """Return the sorted tracked paths that *rule* owns as runtime files.

    A path is kept when ``subsystem_matches_path`` accepts it for *rule* and
    it is neither a test/fixture nor an ignored runtime file.
    """
    selected: list[str] = []
    for candidate in tracked_files:
        if not subsystem_matches_path(rule, candidate):
            continue
        if is_test_or_fixture(candidate) or is_ignored_runtime_file(candidate):
            continue
        selected.append(candidate)
    selected.sort()
    return selected
def _string_list_shape(raw: Any, *, context: str, errors: list[str]) -> list[Any]:
    """Check that *raw* is a list whose string entries are unique and sorted.

    Appends "must be a list", "must not contain duplicates" and "must be
    sorted lexicographically" errors under *context*. Returns the entries the
    caller should validate one by one (an empty list when *raw* is not a list).
    """
    if not isinstance(raw, list):
        errors.append(f"{context} must be a list")
        return []
    # Non-string entries are reported individually by _validate_entries.
    # Leaving them out here keeps set() (unhashable values) and casefold()
    # (non-strings) from raising before those errors can be recorded.
    strings = [entry for entry in raw if isinstance(entry, str)]
    if len(strings) != len(set(strings)):
        errors.append(f"{context} must not contain duplicates")
    if strings != sorted_casefold(strings):
        errors.append(f"{context} must be sorted lexicographically")
    return raw


def _validate_entries(
    values: list[Any],
    *,
    context: str,
    errors: list[str],
    tracked_files: set[str],
    validator: Any,
) -> None:
    """Require each entry to be a non-empty string, then run *validator* on it.

    *validator* is ``validate_prefix`` or ``validate_path_reference``; it is
    called with the entry and the indexed context ``"<context>[<index>]"``.
    """
    for index, value in enumerate(values):
        entry_context = f"{context}[{index}]"
        if not isinstance(value, str) or not value.strip():
            errors.append(f"{entry_context} must be a non-empty string")
            continue
        validator(value, context=entry_context, errors=errors, tracked_files=tracked_files)


def _audit_string_list(
    raw: Any,
    *,
    context: str,
    errors: list[str],
    tracked_files: set[str],
    validator: Any,
) -> None:
    """Run the list-level checks and then the per-entry checks for one field."""
    values = _string_list_shape(raw, context=context, errors=errors)
    _validate_entries(
        values,
        context=context,
        errors=errors,
        tracked_files=tracked_files,
        validator=validator,
    )


def _audit_path_policy(
    raw_policy: dict[str, Any],
    *,
    policy_context: str,
    required_fields: set[str],
    seen_policy_ids: set[str],
    errors: list[str],
    tracked_files: set[str],
) -> None:
    """Validate the fields of a single path policy object.

    ``seen_policy_ids`` is shared across the policies of one subsystem and is
    updated with this policy's id when that id is valid and new.
    """
    for field in sorted(required_fields):
        if field not in raw_policy:
            errors.append(f"{policy_context} missing required field {field}")

    policy_id = raw_policy.get("id")
    if not isinstance(policy_id, str) or not policy_id.strip():
        errors.append(f"{policy_context} missing non-empty string id")
    elif policy_id in seen_policy_ids:
        errors.append(f"{policy_context} duplicates policy id {policy_id!r}")
    else:
        seen_policy_ids.add(policy_id)

    label = raw_policy.get("label")
    if not isinstance(label, str) or not label.strip():
        errors.append(f"{policy_context} missing non-empty string label")

    # Both match lists get their list-level checks first so that the
    # "at least one" error precedes any per-entry error.
    match_prefixes_context = f"{policy_context}.match_prefixes"
    match_files_context = f"{policy_context}.match_files"
    match_prefixes = _string_list_shape(
        raw_policy.get("match_prefixes"), context=match_prefixes_context, errors=errors
    )
    match_files = _string_list_shape(
        raw_policy.get("match_files"), context=match_files_context, errors=errors
    )
    if not match_prefixes and not match_files:
        errors.append(f"{policy_context} must define at least one match_prefix or match_file")
    _validate_entries(
        match_prefixes,
        context=match_prefixes_context,
        errors=errors,
        tracked_files=tracked_files,
        validator=validate_prefix,
    )
    _validate_entries(
        match_files,
        context=match_files_context,
        errors=errors,
        tracked_files=tracked_files,
        validator=validate_path_reference,
    )

    if not isinstance(raw_policy.get("allow_same_subsystem_tests"), bool):
        errors.append(f"{policy_context}.allow_same_subsystem_tests must be a bool")

    _audit_string_list(
        raw_policy.get("test_prefixes"),
        context=f"{policy_context}.test_prefixes",
        errors=errors,
        tracked_files=tracked_files,
        validator=validate_prefix,
    )
    _audit_string_list(
        raw_policy.get("exact_files"),
        context=f"{policy_context}.exact_files",
        errors=errors,
        tracked_files=tracked_files,
        validator=validate_path_reference,
    )


def _audit_policy_reachability(
    policies: list[tuple[str, dict[str, Any]]],
    owned_runtime: list[str],
    errors: list[str],
) -> None:
    """Flag policies that match nothing or are fully shadowed by earlier ones."""
    earlier: list[dict[str, Any]] = []
    for policy_context, policy in policies:
        matched = [path for path in owned_runtime if path_policy_matches(policy, path)]
        if not matched:
            errors.append(f"{policy_context} does not match any owned runtime files")
        elif earlier and all(
            any(path_policy_matches(previous, path) for previous in earlier)
            for path in matched
        ):
            errors.append(
                f"{policy_context} is unreachable because earlier path policies already match all owned runtime files"
            )
        earlier.append(policy)


def _audit_shared_ownerships(
    raw_shared_ownerships: list[Any],
    *,
    required_fields: set[str],
    known_subsystem_ids: set[str],
    actual_shared_ownership: dict[str, list[str]],
    errors: list[str],
    tracked_files: set[str],
) -> list[str]:
    """Validate each declared shared-ownership entry against actual ownership.

    Returns the declared paths in declaration order (duplicates included) so
    the caller can check ordering and compare against the actual overlaps.
    """
    seen_shared_paths: set[str] = set()
    declared_shared_paths: list[str] = []
    for index, raw_shared in enumerate(raw_shared_ownerships):
        context = f"shared_ownerships[{index}]"
        if not isinstance(raw_shared, dict):
            errors.append(f"{context} must be an object")
            continue
        for field in sorted(required_fields):
            if field not in raw_shared:
                errors.append(f"{context} missing required field {field}")

        path = raw_shared.get("path")
        if not isinstance(path, str) or not path.strip():
            errors.append(f"{context}.path must be a non-empty string")
            continue
        declared_shared_paths.append(path)
        validate_path_reference(
            path,
            context=f"{context}.path",
            errors=errors,
            tracked_files=tracked_files,
        )
        if path in seen_shared_paths:
            errors.append(f"{context}.path duplicates shared ownership entry for {path!r}")
        seen_shared_paths.add(path)

        rationale = raw_shared.get("rationale")
        if not isinstance(rationale, str) or not rationale.strip():
            errors.append(f"{context}.rationale must be a non-empty string")

        subsystems = raw_shared.get("subsystems")
        if not isinstance(subsystems, list):
            errors.append(f"{context}.subsystems must be a list")
            subsystems = []
        else:
            # Only strings are compared for duplicates: non-string entries are
            # reported below and may be unhashable, which would make set() raise.
            string_entries = [entry for entry in subsystems if isinstance(entry, str)]
            if len(string_entries) != len(set(string_entries)):
                errors.append(f"{context}.subsystems must not contain duplicates")

        normalized_subsystems: list[str] = []
        for subsystem_index, subsystem_id in enumerate(subsystems):
            if not isinstance(subsystem_id, str) or not subsystem_id.strip():
                errors.append(f"{context}.subsystems[{subsystem_index}] must be a non-empty string")
                continue
            normalized_subsystems.append(subsystem_id)
            if subsystem_id not in known_subsystem_ids:
                errors.append(
                    f"{context}.subsystems[{subsystem_index}] references unknown subsystem {subsystem_id!r}"
                )
        if normalized_subsystems != sorted_casefold(normalized_subsystems):
            errors.append(f"{context}.subsystems must be sorted lexicographically")

        actual = actual_shared_ownership.get(path)
        if actual is None:
            errors.append(f"{context}.path = {path!r} is not an actual shared-owned runtime file")
        elif normalized_subsystems and normalized_subsystems != actual:
            errors.append(f"{context}.subsystems = {normalized_subsystems!r}, want {actual!r}")
    return declared_shared_paths


def audit_registry_payload(
    payload: dict[str, Any],
    *,
    tracked_files: set[str],
    status_lane_ids: set[str],
    schema_contract: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Audit a registry document and report every problem found.

    Args:
        payload: The parsed registry document.
        tracked_files: Known file paths; path and prefix references in the
            registry are checked against this set.
        status_lane_ids: Lane ids a subsystem's ``lane`` may refer to.
        schema_contract: Result of ``registry_schema_contract``; when omitted
            (or falsy) ``DEFAULT_REGISTRY_SCHEMA_CONTRACT`` is used.

    Returns:
        A dict with ``"errors"`` and ``"warnings"`` (lists of messages; no
        warning is currently produced), ``"summary"`` (counts) and
        ``"subsystems"`` (one summary per subsystem that passed the structural
        checks). When ``subsystems`` is missing or empty the audit stops early:
        ``"summary"`` is then empty and ``"subsystems"`` is absent.

    Malformed entries are reported as errors; list fields containing
    non-string entries no longer abort the audit with an exception.
    """
    active_contract = schema_contract or DEFAULT_REGISTRY_SCHEMA_CONTRACT
    schema = active_contract["schema"]
    required_top_level_fields = set(active_contract["required_top_level_fields"])
    required_subsystem_fields = set(active_contract["required_subsystem_fields"])
    required_verification_fields = set(active_contract["required_verification_fields"])
    required_path_policy_fields = set(active_contract["required_path_policy_fields"])
    required_shared_ownership_fields = set(active_contract["required_shared_ownership_fields"])
    errors: list[str] = []
    warnings: list[str] = []

    for field in sorted(required_top_level_fields):
        if field not in payload:
            errors.append(f"registry.json missing required field {field}")

    expected_version = schema["properties"]["version"]["const"]
    if payload.get("version") != expected_version:
        errors.append(f"registry.json version must be {expected_version}")

    raw_subsystems = payload.get("subsystems")
    if not isinstance(raw_subsystems, list) or not raw_subsystems:
        errors.append("registry.json missing non-empty subsystems list")
        return {"errors": errors, "warnings": warnings, "summary": {}}

    raw_shared_ownerships = payload.get("shared_ownerships")
    if not isinstance(raw_shared_ownerships, list):
        errors.append("registry.json missing shared_ownerships list")
        raw_shared_ownerships = []

    seen_ids: set[str] = set()
    seen_contracts: set[str] = set()
    subsystem_summaries: list[dict[str, Any]] = []
    subsystem_order: list[str] = []

    for index, raw_subsystem in enumerate(raw_subsystems):
        context = f"subsystems[{index}]"
        if not isinstance(raw_subsystem, dict):
            errors.append(f"{context} must be an object")
            continue
        for field in sorted(required_subsystem_fields):
            if field not in raw_subsystem:
                errors.append(f"{context} missing required field {field}")

        subsystem_id = raw_subsystem.get("id")
        if not isinstance(subsystem_id, str) or not subsystem_id.strip():
            errors.append(f"{context} missing non-empty string id")
            continue
        if subsystem_id in seen_ids:
            errors.append(f"{context} duplicates subsystem id {subsystem_id!r}")
        seen_ids.add(subsystem_id)
        subsystem_order.append(subsystem_id)

        lane = raw_subsystem.get("lane")
        if not isinstance(lane, str) or not LANE_RE.match(lane):
            errors.append(f"{context} has invalid lane {lane!r}")
        elif lane not in status_lane_ids:
            errors.append(f"{context} references unknown status lane {lane!r}")

        contract_path = raw_subsystem.get("contract")
        if not isinstance(contract_path, str) or not contract_path.strip():
            errors.append(f"{context} missing non-empty string contract")
        else:
            validate_path_reference(
                contract_path,
                context=f"{context}.contract",
                errors=errors,
                tracked_files=tracked_files,
            )
            if contract_path in seen_contracts:
                errors.append(f"{context} duplicates contract path {contract_path!r}")
            seen_contracts.add(contract_path)

        _audit_string_list(
            raw_subsystem.get("owned_prefixes"),
            context=f"{context}.owned_prefixes",
            errors=errors,
            tracked_files=tracked_files,
            validator=validate_prefix,
        )
        _audit_string_list(
            raw_subsystem.get("owned_files"),
            context=f"{context}.owned_files",
            errors=errors,
            tracked_files=tracked_files,
            validator=validate_path_reference,
        )

        verification = raw_subsystem.get("verification")
        if not isinstance(verification, dict):
            # Without a verification object there is nothing further to audit
            # for this subsystem, and it gets no summary entry.
            errors.append(f"{context}.verification must be an object")
            continue
        for field in sorted(required_verification_fields):
            if field not in verification:
                errors.append(f"{context}.verification missing required field {field}")
        if not isinstance(verification.get("allow_same_subsystem_tests"), bool):
            errors.append(f"{context}.verification.allow_same_subsystem_tests must be a bool")
        explicit_coverage = verification.get("require_explicit_path_policy_coverage")
        if not isinstance(explicit_coverage, bool):
            errors.append(f"{context}.verification.require_explicit_path_policy_coverage must be a bool")
        elif explicit_coverage is not True:
            errors.append(f"{context}.verification.require_explicit_path_policy_coverage must be true")

        _audit_string_list(
            verification.get("test_prefixes"),
            context=f"{context}.verification.test_prefixes",
            errors=errors,
            tracked_files=tracked_files,
            validator=validate_prefix,
        )
        _audit_string_list(
            verification.get("exact_files"),
            context=f"{context}.verification.exact_files",
            errors=errors,
            tracked_files=tracked_files,
            validator=validate_path_reference,
        )

        path_policies = verification.get("path_policies")
        if not isinstance(path_policies, list):
            errors.append(f"{context}.verification.path_policies must be a list")
            path_policies = []
        seen_policy_ids: set[str] = set()
        valid_policies: list[tuple[str, dict[str, Any]]] = []
        for policy_index, raw_policy in enumerate(path_policies):
            policy_context = f"{context}.verification.path_policies[{policy_index}]"
            if not isinstance(raw_policy, dict):
                errors.append(f"{policy_context} must be an object")
                continue
            _audit_path_policy(
                raw_policy,
                policy_context=policy_context,
                required_fields=required_path_policy_fields,
                seen_policy_ids=seen_policy_ids,
                errors=errors,
                tracked_files=tracked_files,
            )
            # Every policy that is an object takes part in the coverage checks,
            # even when some of its fields were reported as invalid.
            valid_policies.append((policy_context, raw_policy))

        owned_runtime = owned_runtime_files(raw_subsystem, tracked_files)
        _audit_policy_reachability(valid_policies, owned_runtime, errors)

        uncovered_owned_runtime = [
            path
            for path in owned_runtime
            if not any(path_policy_matches(policy, path) for _, policy in valid_policies)
        ]
        for path in uncovered_owned_runtime:
            errors.append(
                f"{context} requires explicit path policy coverage but {path!r} falls back to default verification"
            )

        subsystem_summaries.append(
            {
                "id": subsystem_id,
                "lane": lane,
                "owned_runtime_file_count": len(owned_runtime),
                "default_fallback_count": len(uncovered_owned_runtime),
                "path_policy_count": len(path_policies),
            }
        )

    if subsystem_order != sorted_casefold(subsystem_order):
        errors.append("registry.json subsystems must be sorted by subsystem id")

    # Map every owned runtime file to the subsystems claiming it; files with
    # more than one claimant are the actual shared ownerships.
    overlap_index: dict[str, list[str]] = {}
    for raw_subsystem in raw_subsystems:
        if not isinstance(raw_subsystem, dict):
            continue
        subsystem_id = raw_subsystem.get("id")
        if not isinstance(subsystem_id, str) or not subsystem_id.strip():
            continue
        for path in owned_runtime_files(raw_subsystem, tracked_files):
            overlap_index.setdefault(path, []).append(subsystem_id)
    actual_shared_ownership = {
        path: sorted_casefold(owners)
        for path, owners in overlap_index.items()
        if len(owners) > 1
    }

    declared_shared_paths = _audit_shared_ownerships(
        raw_shared_ownerships,
        required_fields=required_shared_ownership_fields,
        known_subsystem_ids=seen_ids,
        actual_shared_ownership=actual_shared_ownership,
        errors=errors,
        tracked_files=tracked_files,
    )
    if declared_shared_paths != sorted_casefold(declared_shared_paths):
        errors.append("registry.json shared_ownerships must be sorted by path")

    declared_shared_ownership = set(declared_shared_paths)
    actual_shared_paths = set(actual_shared_ownership)
    for path in sorted(actual_shared_paths - declared_shared_ownership, key=str.casefold):
        errors.append(
            f"registry.json missing shared ownership entry for {path!r} owned by {actual_shared_ownership[path]!r}"
        )
    for path in sorted(declared_shared_ownership - actual_shared_paths, key=str.casefold):
        errors.append(f"registry.json shared ownership entry for {path!r} is stale")

    return {
        "errors": errors,
        "warnings": warnings,
        "summary": {
            "shared_ownership_count": len(actual_shared_ownership),
            "subsystem_count": len(subsystem_summaries),
            "explicit_coverage_subsystems": sum(
                1
                for subsystem in raw_subsystems
                if isinstance(subsystem, dict)
                and isinstance(subsystem.get("verification"), dict)
                and subsystem["verification"].get("require_explicit_path_policy_coverage") is True
            ),
        },
        "subsystems": subsystem_summaries,
    }
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the command-line flags ``--check``, ``--pretty`` and ``--staged``.

    All three are boolean switches that default to ``False``.
    """
    parser = argparse.ArgumentParser(description="Audit the active release profile subsystem registry.")
    switches = (
        ("--check", "Exit non-zero if the registry audit finds any errors."),
        ("--pretty", "Print a concise human-readable summary instead of JSON."),
        ("--staged", "Read registry control files from the git index instead of the working tree."),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)
    return parser.parse_args(argv)
def render_pretty(report: dict[str, Any]) -> str:
    """Render an audit report as newline-joined, human-readable text.

    Emits, in order: a one-line summary (only when the report's summary is
    non-empty), one line per subsystem summary, then the warnings and the
    errors, each under its own heading and only when present.
    """
    out: list[str] = []
    summary = report.get("summary", {})
    if summary:
        out.append(
            "summary: "
            f"subsystems={summary.get('subsystem_count', 0)} "
            f"shared_ownerships={summary.get('shared_ownership_count', 0)} "
            f"explicit_coverage={summary.get('explicit_coverage_subsystems', 0)}"
        )
    out.extend(
        f"{entry['id']}: lane={entry['lane']} "
        f"owned_runtime={entry['owned_runtime_file_count']} "
        f"default_fallback={entry['default_fallback_count']} "
        f"path_policies={entry['path_policy_count']}"
        for entry in report.get("subsystems", [])
    )
    for heading, key in (("warnings:", "warnings"), ("errors:", "errors")):
        messages = report.get(key)
        if messages:
            out.append(heading)
            out.extend(f" - {message}" for message in messages)
    return "\n".join(out)
def main(argv: list[str] | None = None) -> int:
    """Run the registry audit and print the report.

    ``argv`` holds the command-line arguments without the program name; when
    it is ``None`` or empty no flags are set. Returns 1 when ``--check`` was
    given and the report contains errors, otherwise 0.
    """
    options = parse_args(list(argv or []))
    use_index = options.staged
    status_payload = load_status_payload(staged=use_index)

    # Fall back to the local repo when the status scope names no usable repo.
    declared_repos = status_payload.get("scope", {}).get("active_repos", [])
    active_repos = [
        repo_id
        for repo_id in declared_repos
        if isinstance(repo_id, str) and repo_id.strip()
    ]
    if not active_repos:
        active_repos = [REPO_ROOT.name]

    lane_ids: set[str] = set()
    for lane in status_payload.get("lanes", []):
        if isinstance(lane, dict) and isinstance(lane.get("id"), str):
            lane_ids.add(lane["id"])

    registry = load_registry_payload(staged=use_index)
    workspace_files = tracked_workspace_files(active_repos=active_repos, local_repo=REPO_ROOT.name)
    contract = registry_schema_contract(staged=use_index)
    report = audit_registry_payload(
        registry,
        tracked_files=workspace_files,
        status_lane_ids=lane_ids,
        schema_contract=contract,
    )

    if options.pretty:
        print(render_pretty(report))
    else:
        print(json.dumps(report, indent=2, sort_keys=True))
    return 1 if options.check and report["errors"] else 0
# Script entry point: run the audit with the command-line arguments and use
# main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))