Files
compound-engineering-plugin…/plugins/compound-engineering/skills/ce-compound-refresh/scripts/validate-frontmatter.py

138 lines
4.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""Validate ce-compound docs/solutions/ frontmatter for parser-safety issues.
Usage:
python3 validate-frontmatter.py <doc-path>
Exit codes:
0 — frontmatter passes all checks
1 — validation failure (diagnostics on stderr)
2 — usage error (bad arguments, missing file)
Scope: this script catches *parser-safety* issues — frontmatter that strict
YAML parsers will silently misread. It does NOT validate against the
schema's required-field or enum-value rules; that's a separate concern. The
intent is to prevent the silent-data-loss bug class where YAML's quoting
rules truncate or reframe scalar values without raising.
Checks (regex-based, no YAML parser dependency):
1. File starts and ends frontmatter with `---` lines (matched as full
lines, not substrings — `----` and `---extra` are rejected)
2. No top-level scalar value contains ` #` unquoted (silent comment
truncation — what Codex caught on PR #695)
3. No top-level scalar value contains `: ` unquoted (mapping confusion —
what surfaced in a 2026-04-16 plan doc's `title:` field)
The script does NOT flag values starting with YAML reserved indicators
(`` ` ``, `*`, `&`, `!`, etc.) because those produce loud parser errors
downstream rather than silent corruption — they're already caught by
whatever consumes the doc. This validator's purpose is silent-corruption
prevention, not lint.
Pure-stdlib (no PyYAML or other third-party deps). Runs in <50ms typical.
Designed to produce concrete, actionable error messages so the calling
agent can fix and retry without ambiguity.
"""
import os
import re
import sys
def usage_fail(msg: str) -> "NoReturn":
    """Report a usage error on stderr and terminate with exit status 2.

    Args:
        msg: Description of the problem; written to stderr after a
            ``validate-frontmatter: `` prefix and followed by a newline.

    Raises:
        SystemExit: Always, with code 2 (the script's "usage error" status).
    """
    print(f"validate-frontmatter: {msg}", file=sys.stderr)
    raise SystemExit(2)
def _find_quoting_issues(
    fm_lines: list[str], first_lineno: int = 2
) -> list[str]:
    """Return diagnostics for top-level scalars at risk of silent corruption.

    Only top-level ``key: value`` lines (no leading whitespace) whose value
    is not already quoted or structured are inspected.

    Args:
        fm_lines: The frontmatter lines between the two ``---`` delimiters.
        first_lineno: File line number of ``fm_lines[0]``; 2 when the
            opening delimiter is the first line of the file.

    Returns:
        One human-readable message per problem found, in file order.
    """
    issues: list[str] = []
    for lineno, line in enumerate(fm_lines, start=first_lineno):
        stripped = line.lstrip()
        # Blank lines, comment lines, and lines without a colon carry no
        # key/value pair to inspect.
        if not stripped or stripped.startswith("#") or ":" not in line:
            continue
        # Top-level mapping keys only — skip nested values, array items.
        if line.startswith((" ", "\t")):
            continue
        # Skip list-marker lines like "- item" (these can't be top-level in
        # our frontmatter convention, but be defensive).
        if stripped.startswith("- "):
            continue
        key, _, val = line.partition(":")
        val_stripped = val.strip()
        if not val_stripped:
            # Key with no value on this line — likely a parent of a nested
            # block (`tags:` followed by `- foo`). Nothing to validate here.
            continue
        # Already quoted or structured (block scalar, flow collection).
        if val_stripped[0] in "\"'[{|>":
            continue
        name = key.strip()
        if val[0].isspace() and val_stripped.startswith("#"):
            # `key: #123 ...` — the whitespace that makes `#` a comment
            # marker sits *before* the value, so it is invisible once the
            # value has been stripped. YAML drops the whole value here.
            issues.append(
                f"line {lineno}: '{name}' value starts with '#' — quote it. "
                "YAML treats space-then-# as a comment delimiter and silently "
                "drops the entire value."
            )
        elif re.search(r"\s#", val_stripped):
            issues.append(
                f"line {lineno}: '{name}' value contains ' #' — quote it. "
                "YAML treats space-then-# as a comment delimiter and silently "
                "drops the rest of the value."
            )
        if re.search(r":\s", val_stripped):
            issues.append(
                f"line {lineno}: '{name}' value contains ': ' — quote it. "
                "Strict YAML parsers may treat this as a nested mapping."
            )
    return issues


def main(argv: list[str]) -> int:
    """Validate the frontmatter of one document for parser-safety issues.

    Args:
        argv: Full argument vector (program name followed by exactly one
            document path), normally ``sys.argv``.

    Returns:
        0 when the frontmatter passes every check (``OK: <path>`` is printed
        to stdout); 1 when a check fails or the file is not valid UTF-8
        (diagnostics are written to stderr).

    Raises:
        SystemExit: With code 2, via ``usage_fail``, on a wrong argument
            count, a missing file, or a file that cannot be read.
    """
    if len(argv) != 2:
        # Guard the empty-argv case so the usage message itself cannot crash.
        prog = os.path.basename(argv[0]) if argv else "validate-frontmatter.py"
        usage_fail(f"usage: {prog} <doc-path>")
    doc_path = argv[1]
    if not os.path.isfile(doc_path):
        usage_fail(f"file not found: {doc_path}")

    # Decode explicitly as UTF-8 rather than the locale default so results
    # do not depend on the host's locale settings. A BOM is deliberately
    # not stripped: it makes the first line differ from `---` and is
    # reported as a missing opening delimiter.
    try:
        with open(doc_path, encoding="utf-8") as f:
            text = f.read()
    except OSError as exc:
        usage_fail(f"cannot read {doc_path}: {exc}")
    except UnicodeDecodeError as exc:
        sys.stderr.write(
            f"FAIL: {doc_path}\n"
            f"  file is not valid UTF-8: {exc}\n"
        )
        return 1

    # Check 1: frontmatter delimiters. Match the delimiter as a complete
    # line whose stripped content is exactly `---` — substring matching
    # (e.g. `text.find("\n---", 4)`) would falsely accept `----` or
    # `---extra` as a terminator and let malformed docs slip through to
    # downstream parsers that require a strict `---` line.
    lines = text.split("\n")
    if lines[0].rstrip() != "---":
        sys.stderr.write(
            f"FAIL: {doc_path}\n"
            f"  file does not start with '---' frontmatter delimiter line\n"
        )
        return 1
    end_idx = next(
        (i for i in range(1, len(lines)) if lines[i].rstrip() == "---"),
        None,
    )
    if end_idx is None:
        sys.stderr.write(
            f"FAIL: {doc_path}\n"
            f"  frontmatter not closed (no '---' line after the opening delimiter)\n"
        )
        return 1

    # Checks 2 & 3: silent-corruption quoting risks on top-level scalar
    # fields. lines[1] is file line 2, hence the helper's default offset.
    issues = _find_quoting_issues(lines[1:end_idx])
    if issues:
        sys.stderr.write(f"FAIL: {doc_path}\n")
        for issue in issues:
            sys.stderr.write(f"  {issue}\n")
        return 1
    print(f"OK: {doc_path}")
    return 0
# Script entry point: pass the raw argument vector to main() and use its
# return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv))