compound-engineering-plugin…/tests/review-skill-contract.test.ts

import { readFile } from "fs/promises"
import path from "path"
import { describe, expect, test } from "bun:test"
import { parseFrontmatter } from "../src/utils/frontmatter"

async function readRepoFile(relativePath: string): Promise<string> {
  return readFile(path.join(process.cwd(), relativePath), "utf8")
}

describe("ce-code-review contract", () => {
  test("documents explicit modes and orchestration boundaries", async () => {
    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")

    expect(content).toContain("## Mode Detection")
    expect(content).toContain("mode:autofix")
    expect(content).toContain("mode:report-only")
    expect(content).toContain("mode:headless")
    expect(content).toContain("/tmp/compound-engineering/ce-code-review/<run-id>/")
    expect(content).toContain("Do not write run artifacts.")
    expect(content).toContain(
      "Do not start a mutating review round concurrently with browser testing on the same checkout.",
    )
    expect(content).toContain("mode:report-only cannot switch the shared checkout to review a PR target")
    expect(content).toContain("mode:report-only cannot switch the shared checkout to review another branch")
    expect(content).toContain("Resolve the base ref from the PR's actual base repository, not by assuming `origin`")
    expect(content).not.toContain("Which severities should I fix?")
  })

  test("keeps plan requirements completeness compatible with current and legacy unit formats", async () => {
    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")

    expect(content).toContain("current numeric subsections")
    expect(content).toContain("`### U1.`")
    expect(content).toContain("`### Unit 1:`")
    expect(content).toContain("legacy bullet or checkbox unit entries")
    expect(content).toContain("unaddressed requirements or implementation units")
  })

  test("documents headless mode contract for programmatic callers", async () => {
    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")

    // Headless mode has its own rules section
    expect(content).toContain("### Headless mode rules")

    // No interactive prompts (cross-platform)
    expect(content).toContain(
      "Never use the platform question tool",
    )

    // Structured output format
    expect(content).toContain("### Headless output format")
    expect(content).toContain("Code review complete (headless mode).")
    expect(content).toContain('"Review complete" as the terminal signal')

    // Applies safe_auto fixes but NOT safe for concurrent use
    expect(content).toContain(
      "Not safe for concurrent use on a shared checkout.",
    )

    // Writes artifacts but no externalized work, no commit/push/PR
    expect(content).toContain("Do not file tickets or externalize work.")
    expect(content).toContain(
      "Never commit, push, or create a PR",
    )

    // Single-pass fixing, no bounded re-review rounds
    expect(content).toContain("No bounded re-review rounds")

    // Checkout guard — headless shares report-only's guard
    expect(content).toMatch(/mode:headless.*must run in an isolated checkout\/worktree or stop/)

    // Conflicting mode flags
    expect(content).toContain("**Conflicting mode flags:**")

    // Structured error for missing scope
    expect(content).toContain("Review failed (headless mode). Reason: no diff scope detected.")

    // Degraded signal when all reviewers fail
    expect(content).toContain("Code review degraded (headless mode).")
  })

  test("documents policy-driven routing and residual handoff", async () => {
    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")

    // Routing taxonomy and fixer queue semantics
    expect(content).toContain("## Action Routing")
    expect(content).toContain("Only `safe_auto -> review-fixer` enters the in-skill fixer queue automatically.")

    // Interactive mode four-option routing structure: each distinguishing word must appear
    // as a routing-option label so truncation-safe menus stay intact.
    // Assert presence rather than exact copy — wording can be improved without breaking the test.
    expect(content).toMatch(/\(A\)\s*`Review each finding one by one/)
    expect(content).toMatch(/\(B\)\s*Auto-resolve with best judgment/)
    expect(content).toMatch(/\(C\)\s*`File a \[TRACKER\] ticket/)
    expect(content).toMatch(/\(D\)\s*`Report only/)

    // The new routing question dispatches to focused reference files, not inline prose.
    // bulk-preview.md is now invoked by option C only (the best-judgment path no longer uses it).
    expect(content).toContain("references/walkthrough.md")
    expect(content).toContain("references/bulk-preview.md")
    expect(content).toContain("references/tracker-defer.md")
    // Option C still references bulk-preview; option B does not.
    expect(content).toMatch(/\(C\)\s*`File a \[TRACKER\][^\n]*?references\/bulk-preview\.md/s)

    // Stem is third-person (AGENTS.md:127 — no first-person "I" / "me" in the new routing question).
    // The Interactive branch of After Review Step 2 must not reintroduce the removed bucket-policy wording.
    expect(content).not.toContain("What should I do with the remaining findings?")
    expect(content).not.toContain("What should I do?")

    // Zero-remaining case: routing question is skipped with a completion summary.
    expect(content).toMatch(/skip the routing question entirely/i)

    // Stage 5 tie-breaking rule — the walk-through's recommendation is deterministic.
    expect(content).toMatch(/Skip\s*>\s*Defer\s*>\s*Apply/)

    // Autofix-mode residual handoff is the run artifact (file-based todo system removed).
    expect(content).toContain(
      "In autofix mode, the run artifact is the handoff.",
    )
    expect(content).not.toContain("ce-todo-create")
    expect(content).not.toContain("create durable todo files")

    // Tracker fallback chain still exists for defer actions.
    const trackerDefer = await readRepoFile(
      "plugins/compound-engineering/skills/ce-code-review/references/tracker-defer.md",
    )
    expect(trackerDefer).toContain("Named tracker")
    expect(trackerDefer).toContain("GitHub Issues via `gh`")
    expect(trackerDefer).not.toContain(".context/compound-engineering/todos/")
    expect(content).not.toMatch(/harness task primitive|task-tracking primitive/)

    // Harness task-tracking primitive is no longer a fallback tier — it was removed
    // because in-session tasks do not meet the durable-filing intent of a Defer action.
    expect(trackerDefer).not.toMatch(/Harness task primitive \(last resort\)/)
    expect(trackerDefer).not.toMatch(/Once-per-session harness-fallback confirmation/)
    expect(trackerDefer).not.toMatch(/no-sink/)

    // Non-interactive execution mode exists for autonomous callers (e.g., lfg).
    expect(trackerDefer).toContain("## Execution Modes")
    expect(trackerDefer).toContain("Non-interactive mode")
    expect(trackerDefer).toMatch(/no_sink/)

    // Subagent template carries the why_it_matters framing guidance that replaces the
    // rejected synthesis-time rewrite pass. Assert presence of the observable-behavior
    // rule and the required-field reminder without pinning exact prose.
    const subagentTemplate = await readRepoFile(
      "plugins/compound-engineering/skills/ce-code-review/references/subagent-template.md",
    )
    expect(subagentTemplate).toMatch(/observable behavior/i)
    expect(subagentTemplate).toMatch(/required/i)

    // walkthrough.md carries the four per-finding option labels (Apply / Defer / Skip /
    // Auto-resolve with best judgment on the rest). Assert presence of each distinguishing
    // word so renaming an option breaks the test. Exact label wording may be refined for
    // clarity — these assertions check the structural contract, not the prose.
    const walkthrough = await readRepoFile(
      "plugins/compound-engineering/skills/ce-code-review/references/walkthrough.md",
    )
    expect(walkthrough).toContain("Apply the proposed fix")
    expect(walkthrough).toContain("Defer — file a [TRACKER] ticket")
    expect(walkthrough).toContain("Skip — don't apply, don't track")
    expect(walkthrough).toMatch(/Auto-resolve with best judgment on the rest/)

    // bulk-preview.md contract: exactly Proceed / Cancel, no third option.
    const bulkPreview = await readRepoFile(
      "plugins/compound-engineering/skills/ce-code-review/references/bulk-preview.md",
    )
    expect(bulkPreview).toContain("Proceed")
    expect(bulkPreview).toContain("Cancel")

    // Step 5 final-next-steps flow is gated on fixes-applied count, not routing option.
    expect(content).toContain("fixes_applied_count")
    expect(content).toMatch(/Step 5 runs only when `fixes_applied_count > 0`/i)

    // Final-next-steps wording preserved.
    expect(content).toContain("**On the resolved review base/default branch:**")
    expect(content).toContain("git push --set-upstream origin HEAD")
    expect(content).not.toContain("**On main/master:**")
  })

  test("keeps findings schema and downstream docs aligned", async () => {
    const rawSchema = await readRepoFile(
      "plugins/compound-engineering/skills/ce-code-review/references/findings-schema.json",
    )
    const schema = JSON.parse(rawSchema) as {
      _meta: {
        confidence_thresholds: { suppress: string; report: string }
        confidence_anchors: Record<string, string>
      }
      properties: {
        findings: {
          items: {
            properties: {
              autofix_class: { enum: string[] }
              owner: { enum: string[] }
              requires_verification: { type: string }
              confidence: { type: string; enum: number[] }
            }
            required: string[]
          }
        }
      }
    }

    expect(schema.properties.findings.items.required).toEqual(
      expect.arrayContaining(["autofix_class", "owner", "requires_verification"]),
    )
    expect(schema.properties.findings.items.properties.autofix_class.enum).toEqual([
      "safe_auto",
      "gated_auto",
      "manual",
      "advisory",
    ])
    expect(schema.properties.findings.items.properties.owner.enum).toEqual([
      "review-fixer",
      "downstream-resolver",
      "human",
      "release",
    ])
    expect(schema.properties.findings.items.properties.requires_verification.type).toBe("boolean")

    // Anchored confidence: integer enum, no floats
    expect(schema.properties.findings.items.properties.confidence.type).toBe("integer")
    expect(schema.properties.findings.items.properties.confidence.enum).toEqual([0, 25, 50, 75, 100])

    // Threshold: anchor 75 (P0 escape at anchor 50)
    expect(schema._meta.confidence_thresholds.suppress).toContain("anchor 75")
    expect(schema._meta.confidence_thresholds.suppress).toContain("anchor 50")
    expect(schema._meta.confidence_thresholds.suppress).toMatch(/P0/)

    // Behavioral anchors documented for personas
    expect(schema._meta.confidence_anchors).toBeDefined()
    expect(schema._meta.confidence_anchors["0"]).toBeDefined()
    expect(schema._meta.confidence_anchors["25"]).toBeDefined()
    expect(schema._meta.confidence_anchors["50"]).toBeDefined()
    expect(schema._meta.confidence_anchors["75"]).toBeDefined()
    expect(schema._meta.confidence_anchors["100"]).toBeDefined()

  })

  test("subagent template carries verbatim 5-anchor rubric and lint-ignore suppression", async () => {
    const template = await readRepoFile(
      "plugins/compound-engineering/skills/ce-code-review/references/subagent-template.md",
    )

    // Anchored rubric: each anchor named with behavioral criterion
    expect(template).toMatch(/`0`.*Not confident/)
    expect(template).toMatch(/`25`.*Somewhat confident/)
    expect(template).toMatch(/`50`.*Moderately confident/)
    expect(template).toMatch(/`75`.*Highly confident/)
    expect(template).toMatch(/`100`.*Absolutely certain/)

    // Schema conformance hard constraints reject floats
    expect(template).toContain("`0`, `25`, `50`, `75`, or `100`")
    expect(template).toMatch(/0\.85.*validation failure/i)

    // Lint-ignore rule in false-positive catalog
    expect(template).toMatch(/lint.ignore|lint disable|eslint-disable/i)
    expect(template).toMatch(/suppress unless the suppression itself violates/i)

    // Advisory routing rule preserved
    expect(template).toMatch(/Advisory observations.*route to advisory/i)

    // Personas never produce anchors 0 or 25 (suppress silently)
    expect(template).toMatch(/personas never produce/i)
  })

  test("autofix_class decision guide includes safe_auto operational test and boundary cases", async () => {
    const template = await readRepoFile(
      "plugins/compound-engineering/skills/ce-code-review/references/subagent-template.md",
    )

    // Symmetry-of-error framing: classifying a mechanical fix as gated_auto has cost
    expect(template).toMatch(/wrong-side cost is symmetric/i)
    expect(template).toMatch(/Bias toward `safe_auto`/i)

    // Operational test for safe_auto: one-sentence + no-contract-change exclusion list
    expect(template).toMatch(/one sentence with no .depends on. clauses/i)
    expect(template).toMatch(/function signature.*public-API.*error contract.*security posture.*permission model/i)

    // The four boundary cases that often feel risky but are still safe_auto
    expect(template).toMatch(/Boundary cases that often feel risky but are still `safe_auto`/i)
    expect(template).toMatch(/nil guard that turns a crash into a nil-return is `safe_auto`/i)
    expect(template).toMatch(/off-by-one fix is `safe_auto`/i)
    expect(template).toMatch(/Dead-code removal is `safe_auto`/i)
    expect(template).toMatch(/Helper extraction is `safe_auto`/i)

    // Cross-file extraction discriminator (the F4b case from the calibration eval)
    expect(template).toMatch(/naming or placement requires a design conversation/i)

    // Anti-default guards on both sides
    expect(template).toMatch(/Do not default to `advisory`/i)
    expect(template).toMatch(/Do not default to `gated_auto` when the fix is mechanical/i)
  })

  test("Stage 4 spawning restates model-override imperative at point of action", async () => {
    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")

    // Model tiering subsection still enumerates the three session-model exceptions
    expect(content).toMatch(/ce-correctness-reviewer.*ce-security-reviewer.*ce-adversarial-reviewer/s)

    // Imperative lives inside the Spawning subsection, not only in the rationale block.
    // Extract the Spawning subsection and assert the model-override directive appears there
    // with cross-platform dispatch primitives named at the call site.
    const spawningMatch = content.match(/#### Spawning\n([\s\S]*?)(?=\n####|\n### )/)
    expect(spawningMatch).not.toBeNull()
    const spawning = spawningMatch![1]

    expect(spawning).toMatch(/Model override at dispatch time/)
    expect(spawning).toContain('model: "sonnet"')
    expect(spawning).toContain("Agent")
    expect(spawning).toContain("spawn_agent")
    expect(spawning).toContain("subagent")
    expect(spawning).toMatch(/Bounded parallel dispatch/)
    expect(spawning).toMatch(/active-subagent limit/)
    expect(spawning).toMatch(/spawn errors as backpressure, not reviewer failure/)
    expect(spawning).toMatch(/fill freed slots/)
    // Exceptions are restated at point of action so the agent does not have to recall them
    // from the Model tiering subsection above during a 12-agent parallel dispatch.
    expect(spawning).toContain("ce-correctness-reviewer")
    expect(spawning).toContain("ce-security-reviewer")
    expect(spawning).toContain("ce-adversarial-reviewer")
  })

  test("Stage 5 synthesis uses anchor gate and one-anchor promotion", async () => {
    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")

    // Confidence value constraint is integer enum
    expect(content).toMatch(/confidence:\s*integer in \{0, 25, 50, 75, 100\}/)

    // Confidence gate at anchor 75 with P0 exception at 50
    expect(content).toMatch(/suppress remaining findings below anchor 75/i)
    expect(content).toMatch(/P0 findings at anchor 50\+ survive/)

    // Confidence gate runs AFTER dedup, promotion, and demotion so anchor-50 findings
    // can be promoted by cross-reviewer agreement or rerouted to soft buckets first.
    // This is a load-bearing ordering — if the gate runs early, promotion/demotion become unreachable.
    expect(content).toMatch(/gate runs late deliberately/i)

    // One-anchor promotion replaces +0.10 boost
    expect(content).toMatch(/one anchor step.*50 -> 75.*75 -> 100/)
    expect(content).not.toContain("boost the merged confidence by 0.10")

    // Sort by anchor descending, not "confidence (descending)"
    expect(content).toMatch(/anchor \(descending\)/)
  })

  test("Stage 5b validation pass dispatches conditionally and bounds parallelism", async () => {
    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")
    const validatorTemplate = await readRepoFile(
      "plugins/compound-engineering/skills/ce-code-review/references/validator-template.md",
    )

    // Stage 5b exists between Stage 5 and Stage 6
    expect(content).toContain("### Stage 5b: Validation pass")

    // Mode-conditional dispatch — runs on autofix/headless/option C; explicitly does NOT
    // run on the best-judgment path (option B and walk-through's auto-resolve-the-rest).
    expect(content).toContain("`headless`")
    expect(content).toContain("`autofix`")
    expect(content).toContain("walk-through routing (option A)")
    expect(content).toContain("best-judgment routing (option B)")
    expect(content).toContain("File-tickets routing (option C)")
    expect(content).toMatch(/Report-only routing.*nothing is being externalized/i)

    // Best-judgment path explicitly skips Stage 5b — the fixer's apply/fail outcome is the validation.
    expect(content).toMatch(/best-judgment routing \(option B\) \| No --/)
    expect(content).toMatch(/best-judgment-the-rest handoff \| No --/)
    expect(content).toMatch(/best-judgment path skips Stage 5b deliberately/i)

    // Per-finding bounded dispatch (not batched)
    expect(content).toMatch(/per.finding bounded dispatch/i)
    expect(content).toMatch(/Independence is the point/i)
    expect(content).toMatch(/same bounded scheduler from Stage 4/i)
    expect(content).toMatch(/active-subagent limit/i)

    // Budget cap of 15
    expect(content).toMatch(/exceeds 15 findings/i)
    expect(content).toMatch(/highest-severity 15.*Drop the remainder/i)

    // Option C invokes validation before externalizing (option B no longer does).
    expect(content).toMatch(/\(C\)\s*`File a \[TRACKER\].*first run Stage 5b validation/)
    expect(content).not.toMatch(/\(B\).*first run Stage 5b validation/)

    // Option B dispatches the fixer immediately — no Stage 5b, no bulk-preview.
    expect(content).toMatch(/\(B\)\s*`Auto-resolve with best judgment.*dispatch the fixer subagent.*immediately/i)
    expect(content).toMatch(/No Stage 5b validator pre-pass/i)
    expect(content).toMatch(/No bulk-preview approval gate/i)

    // Validator template exists and is read-only
    expect(validatorTemplate).toContain("independent validator")
    expect(validatorTemplate).toContain("operationally read-only")
    expect(validatorTemplate).toContain('"validated": true | false')
    expect(validatorTemplate).toMatch(/introduced by THIS diff/i)
    expect(validatorTemplate).toMatch(/handled elsewhere/i)
  })

  test("best-judgment path post-run failure-handling question fires only when failed bucket non-empty", async () => {
    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")

    // Post-run question fires when the fixer's `failed` bucket is non-empty.
    expect(content).toMatch(/N findings could not be auto-resolved/)
    expect(content).toContain("File tickets for these")
    expect(content).toContain("Walk through these one at a time")
    expect(content).toContain("Ignore — leave them in the report")

    // Sink-availability rule mirrors tracker-defer.md: omit file-tickets when no sink.
    expect(content).toMatch(/Omit this option when.*any_sink_available\s*=\s*false/i)
  })

  test("fixer subagent contract supports heterogeneous best-judgment queue", async () => {
    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")

    // Step 3 documents both queue shapes: homogeneous (autofix/headless/walk-through Apply)
    // and heterogeneous (best-judgment path with gated_auto + manual + advisory).
    expect(content).toMatch(/Heterogeneous queue/i)
    expect(content).toMatch(/`gated_auto`,\s*`manual`,\s*and\s*`advisory`/i)

    // Fixer routes items by class with explicit reason taxonomy for the failed bucket.
    expect(content).toMatch(/no fix proposed by reviewer/i)
    expect(content).toMatch(/evidence no longer matches code/i)
    expect(content).toMatch(/fix did not apply cleanly/i)

    // Best-judgment path is single-pass; bounded re-review applies to autofix and walk-through Apply.
    expect(content).toMatch(/Best-judgment path is single-pass/i)
    expect(content).toMatch(/max_rounds:\s*2/)

    // Fixer return shape includes the {applied, failed, advisory} partition.
    expect(content).toMatch(/\{applied,\s*failed,\s*advisory\}/)
  })

  test("PR-mode skip-condition pre-check stops without dispatching reviewers", async () => {
    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")

    // Skip-check section exists
    expect(content).toContain("**Skip-condition pre-check.**")

    // gh pr view fetches state and file list for trivial judgment
    expect(content).toMatch(/gh pr view.*--json state,title,body,files/)

    // Hard skip rules
    expect(content).toMatch(/state.*CLOSED.*MERGED/)

    // Draft PRs are explicitly NOT skipped
    expect(content).not.toMatch(/isDraft.*true.*stop/)
    expect(content).toMatch(/Draft PRs are reviewed normally/)

    // Trivial-PR judgment uses lightweight model, not a regex
    expect(content).toMatch(/lightweight sub-agent/)
    expect(content).toMatch(/model.*haiku/i)
    expect(content).not.toMatch(/chore\\?\(deps\\?\)/)

    // Skip cleanly without dispatching reviewers
    expect(content).toMatch(/stop without dispatching reviewers/)

    // Standalone branch and base: modes unaffected
    expect(content).toMatch(/Standalone branch mode and `base:` mode are unaffected/)
  })

  test("mode-aware demotion routes weak general-quality findings to soft buckets", async () => {
    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")

    // Mode-aware demotion step exists (sub-step within Stage 5; numbering may shift if steps reorder)
    expect(content).toMatch(/Mode-aware demotion of weak general-quality findings/i)

    // Conservative scope: testing + maintainability personas only
    expect(content).toContain("`testing` or `maintainability`")

    // Severity P2 or P3 only (P0/P1 always stay primary)
    expect(content).toMatch(/Severity is P2 or P3/)

    // autofix_class is advisory
    expect(content).toMatch(/`autofix_class` is `advisory`/)

    // Interactive/report-only: route to testing_gaps or residual_risks
    expect(content).toMatch(/`testing`,?\s*append.*`testing_gaps`/)
    expect(content).toMatch(/`maintainability`,?\s*append.*`residual_risks`/)

    // Demotion entry uses title-only (compact return omits why_it_matters; report-only has no artifact)
    expect(content).toMatch(/append `<file:line> -- <title>` to/)
    expect(content).toMatch(/title only.*compact return omits/i)

    // Headless/autofix: suppress entirely
    expect(content).toMatch(/Headless and autofix modes.*Suppress/)

    // Coverage section reports demotion count
    expect(content).toMatch(/mode-aware demotion/)
  })

  test("personas use anchored rubric language and no float references remain", async () => {
    const personas = [
      "ce-correctness-reviewer",
      "ce-testing-reviewer",
      "ce-maintainability-reviewer",
      "ce-project-standards-reviewer",
      "ce-security-reviewer",
      "ce-performance-reviewer",
      "ce-api-contract-reviewer",
      "ce-data-migration-reviewer",
      "ce-reliability-reviewer",
      "ce-adversarial-reviewer",
      "ce-previous-comments-reviewer",
      "ce-julik-frontend-races-reviewer",
      "ce-swift-ios-reviewer",
      "ce-agent-native-reviewer",
    ]

    for (const persona of personas) {
      const content = await readRepoFile(`plugins/compound-engineering/agents/${persona}.md`)

      // Anchored language appears
      expect(content).toMatch(/Anchor (75|100)/)
      expect(content).toMatch(/Anchor 25 or below.*suppress/i)

      // No float confidence references
      expect(content).not.toMatch(/0\.\d{2}\+/)
      expect(content).not.toMatch(/0\.60-0\.79/)
      expect(content).not.toMatch(/below 0\.60/)
    }
  })

  test("documents stack-specific conditional reviewers for the JSON pipeline", async () => {
    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")
    const catalog = await readRepoFile(
      "plugins/compound-engineering/skills/ce-code-review/references/persona-catalog.md",
    )

    for (const agent of ["ce-julik-frontend-races-reviewer", "ce-swift-ios-reviewer"]) {
      expect(content).toContain(agent)
      expect(catalog).toContain(agent)
    }

    for (const removed of [
      "ce-dhh-rails-reviewer",
      "ce-kieran-rails-reviewer",
      "ce-kieran-python-reviewer",
      "ce-kieran-typescript-reviewer",
    ]) {
      expect(content).not.toContain(removed)
      expect(catalog).not.toContain(removed)
    }

    expect(content).toContain("## Language-Aware Conditionals")
    expect(content).not.toContain("## Language-Agnostic")
  })

  test("stack-specific reviewer agents follow the structured findings contract", async () => {
    const reviewers = [
      {
        path: "plugins/compound-engineering/agents/ce-julik-frontend-races-reviewer.md",
        reviewer: "julik-frontend-races",
      },
      {
        path: "plugins/compound-engineering/agents/ce-swift-ios-reviewer.md",
        reviewer: "swift-ios",
      },
    ]

    for (const reviewer of reviewers) {
      const content = await readRepoFile(reviewer.path)
      const parsed = parseFrontmatter(content)
      const tools = String(parsed.data.tools ?? "")

      expect(String(parsed.data.description)).toContain("Conditional code-review persona")
      expect(tools).toContain("Read")
      expect(tools).toContain("Grep")
      expect(tools).toContain("Glob")
      expect(tools).toContain("Bash")
      expect(content).toContain("## Confidence calibration")
      expect(content).toContain("## What you don't flag")
      expect(content).toContain("Return your findings as JSON matching the findings schema. No prose outside the JSON.")
      expect(content).toContain(`"reviewer": "${reviewer.reviewer}"`)
    }
  })

  test("JSON-pipeline persona agents grant Write so they can save run artifacts", async () => {
    // The ce-code-review subagent template instructs each persona to write its full
    // analysis to /tmp/compound-engineering/ce-code-review/{run_id}/{reviewer}.json.
    // Without Write in tools, that "one permitted write" cannot happen and headless
    // detail enrichment loses its Why:/Evidence: source. See issue #733.
    const personas = [
      "ce-correctness-reviewer",
      "ce-testing-reviewer",
      "ce-maintainability-reviewer",
      "ce-project-standards-reviewer",
      "ce-security-reviewer",
      "ce-performance-reviewer",
      "ce-api-contract-reviewer",
      "ce-data-migration-reviewer",
      "ce-reliability-reviewer",
      "ce-adversarial-reviewer",
      "ce-previous-comments-reviewer",
      "ce-julik-frontend-races-reviewer",
      "ce-swift-ios-reviewer",
    ]

    for (const persona of personas) {
      const content = await readRepoFile(`plugins/compound-engineering/agents/${persona}.md`)
      const parsed = parseFrontmatter(content)
      const tools = String(parsed.data.tools ?? "")

      expect(tools).toContain("Write")
    }
  })

  test("data-migration reviewer consolidates schema drift and migration safety", async () => {
    const content = await readRepoFile(
      "plugins/compound-engineering/agents/ce-data-migration-reviewer.md",
    )
    const skill = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")

    expect(content).toContain("## Step 0: Schema drift")
    expect(content).toContain('"reviewer": "data-migration"')
    expect(content).toContain("Return your findings as JSON matching the findings schema.")
    expect(skill).toContain("data-migration` spawn gate")
    expect(skill).not.toContain("ce-schema-drift-detector")
    expect(skill).not.toContain("ce-data-migration-expert")
    expect(skill).not.toContain("ce-data-migrations-reviewer")
  })

  test("fails closed when merge-base is unresolved instead of falling back to git diff HEAD", async () => {
    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")

    // No scope path should fall back to `git diff HEAD` or `git diff --cached` — those only
    // show uncommitted changes and silently produce empty diffs on clean feature branches.
    expect(content).not.toContain("git diff --name-only HEAD")
    expect(content).not.toContain("git diff -U10 HEAD")
    expect(content).not.toContain("git diff --cached")

    // PR mode still has an inline error for unresolved base
    expect(content).toContain('echo "ERROR: Unable to resolve PR base branch')

    // Branch and standalone modes must stop when no base can be resolved, not fall back to
    // `git diff HEAD`. The guard phrase appears once per mode (branch + standalone).
    const stopGuardMatches = content.match(/Do not fall back to `git diff HEAD`/g)
    expect(stopGuardMatches?.length).toBeGreaterThanOrEqual(2)
  })

  test("orchestration callers pass explicit mode flags", async () => {
    const lfg = await readRepoFile("plugins/compound-engineering/skills/lfg/SKILL.md")
    expect(lfg).toMatch(/ce-code-review[^\n]*mode:autofix/)
  })

  test("ce-work shipping-workflow enforces a residual-work gate after Tier 2 review", async () => {
    for (const path of [
      "plugins/compound-engineering/skills/ce-work/references/shipping-workflow.md",
      "plugins/compound-engineering/skills/ce-work-beta/references/shipping-workflow.md",
    ]) {
      const workflow = await readRepoFile(path)
      await expect(readRepoFile(path.replace("shipping-workflow.md", "tracker-defer.md"))).resolves.toContain(
        "Non-interactive mode",
      )
      await expect(readRepoFile(path.replace("shipping-workflow.md", "tracker-defer.md"))).resolves.not.toMatch(
        /no-sink/,
      )

      // Gate step is explicitly labeled and required after Tier 2.
      expect(workflow).toContain("**Residual Work Gate**")
      expect(workflow).toMatch(/do not proceed to Final Validation/i)

      // Three forward options + one abort; labels are self-contained.
      expect(workflow).toContain("Apply/fix now")
      expect(workflow).toContain("File tickets via project tracker")
      expect(workflow).toContain("Accept and proceed")
      expect(workflow).toContain("Stop — do not ship")

      // Accept-and-proceed path threads findings into the PR description.
      expect(workflow).toContain("Known Residuals")
      expect(workflow).toContain("docs/residual-review-findings/<branch-or-head-sha>.md")
      expect(workflow).toContain("If the user later chooses the no-PR `ce-commit` path")
      expect(workflow).toContain("must not live only in the transient session")
    }
  })

  test("lfg autonomously handles residuals via non-interactive tracker-defer and PR description", async () => {
    const lfg = await readRepoFile("plugins/compound-engineering/skills/lfg/SKILL.md")
    await expect(readRepoFile("plugins/compound-engineering/skills/lfg/references/tracker-defer.md")).resolves.toContain(
      "Non-interactive mode",
    )
    await expect(readRepoFile("plugins/compound-engineering/skills/lfg/references/tracker-defer.md")).resolves.not.toMatch(
      /no-sink/,
    )

    // Autonomous residual handoff step exists between code review and test-browser.
    expect(lfg).toContain("Persist review autofixes")
    expect(lfg).toContain("fix(review): apply autofix feedback")
    expect(lfg).toContain("Do not proceed to step 5, run browser tests, or output DONE while review autofix edits remain only in the working tree.")
    expect(lfg).toContain("there were no review autofixes to persist")
    expect(lfg).toContain("Autonomous residual handoff")
    expect(lfg).toMatch(/Do not prompt the user/)

    // tracker-defer is invoked in non-interactive mode.
    expect(lfg).toContain("references/tracker-defer.md")
    expect(lfg).not.toContain("plugins/compound-engineering/skills/ce-code-review/references/tracker-defer.md")
    expect(lfg).toMatch(/non-interactive mode/)

    // Structured return buckets drive PR description content.
    expect(lfg).toMatch(/filed/)
    expect(lfg).toMatch(/failed/)
    expect(lfg).toMatch(/no_sink/)

    // PR description update path is non-interactive and does not route through
    // confirmation-driven PR update skills. The positive assertion on
    // `gh pr edit` below is the actual check; a broad `not.toContain` would
    // falsely trip on step 7's legitimate use of ce-commit-push-pr for the
    // post-work commit/PR-open step.
    expect(lfg).toContain("do not load any confirmation-driven PR update skill")
    expect(lfg).toContain("gh pr edit PR_NUMBER --body-file BODY_FILE")
    expect(lfg).toContain("## Residual Review Findings")
    expect(lfg).toContain("docs/residual-review-findings/<branch-or-head-sha>.md")
    expect(lfg).toContain("prefer `origin` when present")
    expect(lfg).toContain("choose the first configured remote")
    expect(lfg).toContain("git push --set-upstream <remote> HEAD")
    expect(lfg).not.toContain("git push --set-upstream origin HEAD")
    expect(lfg).toContain("Do not output DONE until either the existing PR body has been updated or this fallback file commit has been pushed.")

    // Autopilot contract: never prompt, but require a durable sink before DONE.
    expect(lfg).toContain("Do not prompt the user")
    expect(lfg).toMatch(/Never block DONE on tracker filing failures/i)
  })

  test("ce-code-review autofix emits a residual-work summary in-chat, not only in the artifact", async () => {
    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")
    expect(content).toMatch(/Emit a compact Residual Actionable Work summary/)
    expect(content).toContain("with its stable `#`, severity, file:line, title, and autofix_class")
    expect(content).toContain("Structure the summary as two separate contiguous sections")
    expect(content).toContain("applied `safe_auto` fixes first, then residual non-auto findings")
    expect(content).toContain("reuse each finding's stable `#` from Stage 5 -- never renumber")
    expect(content).toContain("Residual actionable work: none.")
  })

  test("ce-code-review uses stable sequential finding numbers across grouped output", async () => {
    const content = await readRepoFile("plugins/compound-engineering/skills/ce-code-review/SKILL.md")
    const template = await readRepoFile(
      "plugins/compound-engineering/skills/ce-code-review/references/review-output-template.md",
    )
    const fixture = await readRepoFile("tests/fixtures/ce-code-review-stable-numbering.md")

    const stage5 = content.split("### Stage 5b:")[0].split("### Stage 5:")[1]
    expect(stage5).toMatch(/Sort and number/)
    expect(stage5).toMatch(/Do not restart numbering inside each severity table or autofix\/routing bucket/)
    expect(stage5).toMatch(/reuse the same stable `#`/)
    expect(stage5).toMatch(/ce-resolve-pr-feedback/)

    const stage6 = content.split("### Headless output format")[0].split("### Stage 6: Synthesize and present")[1]
    expect(stage6).toContain("Finding numbers come from the stable assignment in Stage 5")
    expect(stage6).toContain("never re-derive them per severity table")
    expect(template).toContain("Stable sequential finding numbers")
    expect(template).toContain("reuse those same numbers when findings are repeated in Residual Actionable Work")

    const primaryFindingIds = Array.from(
      fixture.matchAll(/^\| (\d+) \| `[^`]+` \| .* \| .* \| \d+ \| `.*` \|$/gm),
      ([, id]) => Number(id),
    )
    expect(primaryFindingIds).toEqual([1, 2, 3])

    const residualSection = fixture.split("### Residual Actionable Work")[1]
    const residualIds = Array.from(
      residualSection.matchAll(/^\| (\d+) \| `[^`]+` \| .* \| `.*` \| .* \|$/gm),
      ([, id]) => Number(id),
    )
    expect(residualIds).toEqual([2, 3])
    expect(residualIds.every((id) => primaryFindingIds.includes(id))).toBe(true)
  })
})

describe("testing-reviewer contract", () => {
  test("includes behavioral-changes-with-no-test-additions check", async () => {
    const content = await readRepoFile("plugins/compound-engineering/agents/ce-testing-reviewer.md")

    // New check exists in "What you're hunting for" section
    expect(content).toContain("Behavioral changes with no test additions")

    // Check is distinct from untested branches check
    expect(content).toContain("distinct from untested branches")

    // Non-behavioral changes are excluded
    expect(content).toContain("Non-behavioral changes")
  })
})