Initial import from garrytan/gstack@026751e (main snapshot via local relay)

Source: https://github.com/garrytan/gstack/commit/026751e
2026-05-19 21:18:17 +02:00
commit 834c6db075
797 changed files with 267839 additions and 0 deletions
--- a/test/fixtures/coverage-audit-fixture.ts
+++ b/test/fixtures/coverage-audit-fixture.ts
@@ -0,0 +1,76 @@
+/**
+ * Shared fixture for test coverage audit E2E tests.
+ *
+ * Creates a Node.js project with billing source code that has intentional
+ * test coverage gaps: processPayment has happy-path-only tests,
+ * refundPayment has no tests at all.
+ *
+ * Used by: ship-coverage-audit E2E, review-coverage-audit E2E
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import { spawnSync } from 'child_process';
+
+/**
+ * Populate `dir` with a small Node.js billing project whose tests deliberately
+ * leave coverage gaps, then turn it into a git repo on a feature branch.
+ *
+ * Files written: package.json, vitest.config.ts, VERSION, CHANGELOG.md,
+ * src/billing.ts and test/billing.test.ts. The single test exercises only the
+ * happy path of `processPayment`; `refundPayment` is left untested on purpose.
+ *
+ * Git state afterwards: one commit on `main`, with `feature/billing` checked out.
+ *
+ * @param dir - Directory to populate. It is not created here, so it presumably
+ *   must already exist — confirm against callers.
+ * @throws Error if a git command cannot be spawned, times out, or exits with a
+ *   non-zero status.
+ */
+export function createCoverageAuditFixture(dir: string): void {
+  // Create a Node.js project WITH test framework but coverage gaps
+  fs.writeFileSync(path.join(dir, 'package.json'), JSON.stringify({
+    name: 'test-coverage-app',
+    version: '1.0.0',
+    type: 'module',
+    scripts: { test: 'echo "no tests yet"' },
+    devDependencies: { vitest: '^1.0.0' },
+  }, null, 2));
+
+  // Create vitest config
+  fs.writeFileSync(path.join(dir, 'vitest.config.ts'),
+    `import { defineConfig } from 'vitest/config';\nexport default defineConfig({ test: {} });\n`);
+
+  fs.writeFileSync(path.join(dir, 'VERSION'), '0.1.0.0\n');
+  fs.writeFileSync(path.join(dir, 'CHANGELOG.md'), '# Changelog\n');
+
+  // Create source file with multiple code paths
+  fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
+  fs.writeFileSync(path.join(dir, 'src', 'billing.ts'), `
+export function processPayment(amount: number, currency: string) {
+  if (amount <= 0) throw new Error('Invalid amount');
+  if (currency !== 'USD' && currency !== 'EUR') throw new Error('Unsupported currency');
+  return { status: 'success', amount, currency };
+}
+
+export function refundPayment(paymentId: string, reason: string) {
+  if (!paymentId) throw new Error('Payment ID required');
+  if (!reason) throw new Error('Reason required');
+  return { status: 'refunded', paymentId, reason };
+}
+`);
+
+  // Create a test directory with ONE test (partial coverage)
+  fs.mkdirSync(path.join(dir, 'test'), { recursive: true });
+  fs.writeFileSync(path.join(dir, 'test', 'billing.test.ts'), `
+import { describe, test, expect } from 'vitest';
+import { processPayment } from '../src/billing';
+
+describe('processPayment', () => {
+  test('processes valid payment', () => {
+    const result = processPayment(100, 'USD');
+    expect(result.status).toBe('success');
+  });
+  // GAP: no test for invalid amount
+  // GAP: no test for unsupported currency
+  // GAP: refundPayment not tested at all
+});
+`);
+
+  // Run one git command inside the fixture directory. spawnSync reports
+  // failure through its return value instead of throwing, so check it here:
+  // a silently half-initialized repo makes later E2E failures hard to trace.
+  const git = (...args: string[]): void => {
+    const result = spawnSync('git', args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
+    const label = `git ${args.join(' ')}`;
+    if (result.error) {
+      // Covers spawn failures (e.g. git not installed) and the 5s timeout.
+      throw new Error(`${label} failed: ${result.error.message}`);
+    }
+    if (result.status !== 0) {
+      const stderr = result.stderr ? result.stderr.toString().trim() : '';
+      throw new Error(
+        `${label} exited with status ${result.status} (signal ${result.signal}): ${stderr}`,
+      );
+    }
+  };
+
+  // Init git repo with main branch
+  git('init', '-b', 'main');
+  git('config', 'user.email', 'test@test.com');
+  git('config', 'user.name', 'Test');
+  git('add', '.');
+  git('commit', '-m', 'initial commit');
+
+  // Create feature branch
+  git('checkout', '-b', 'feature/billing');
+}
--- a/test/fixtures/eval-baselines.json
+++ b/test/fixtures/eval-baselines.json
@@ -0,0 +1,7 @@
+{
+  "command_reference": { "clarity": 4, "completeness": 3, "actionability": 4 },
+  "snapshot_flags": { "clarity": 4, "completeness": 4, "actionability": 4 },
+  "browse_skill": { "clarity": 4, "completeness": 4, "actionability": 4 },
+  "qa_workflow": { "clarity": 4, "completeness": 4, "actionability": 4 },
+  "qa_health_rubric": { "clarity": 4, "completeness": 3, "actionability": 4 }
+}
--- a/test/fixtures/forcing-finding-seeds.ts
+++ b/test/fixtures/forcing-finding-seeds.ts
@@ -0,0 +1,122 @@
+/**
+ * Per-skill draft-plan seeds engineered to surface at least one
+ * review-phase finding in the corresponding plan-* review skill.
+ *
+ * Used by gate-tier finding-floor tests
+ * (test/skill-e2e-plan-{eng,ceo,design,devex}-finding-floor.test.ts) as
+ * the minimum-cost regression for the May 2026 transcript bug:
+ *   "/plan-eng-review reviewed a real PR diff, wrote a multi-section
+ *    review plan to ~/.claude/plans/ and called ExitPlanMode without
+ *    ever firing AskUserQuestion."
+ *
+ * Each seed is small and pre-loaded with one obvious finding the
+ * matching skill cannot honestly miss. Floor tests assert
+ * `reviewCount >= 1` — i.e., the model fired at least one review-phase
+ * AUQ before reaching plan_ready / completion_summary / ceiling.
+ *
+ * Each seed includes the standard "write your plan-mode plan to /tmp/…"
+ * preamble that the existing periodic finding-count fixtures use, so
+ * the agent has a concrete plan-file target. The /tmp path is unique
+ * per skill to avoid collisions if floor tests run in parallel.
+ *
+ * For a deeper [N-1, N+2] count band assertion, see the periodic
+ * test/skill-e2e-plan-{X}-finding-count.test.ts fixtures.
+ */
+
+/**
+ * Floor seed for the eng plan review: the plan hand-rolls a UUIDv7 generator
+ * in every service instead of using crypto.randomUUID(), and admits it has no
+ * concrete reason for doing so.
+ */
+export const FORCING_FLOOR_ENG = `Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-eng-floor.md (use Edit/Write to that exact path).
+
+# Plan: Add request-id propagation across services
+
+## Architecture
+We'll roll a custom UUIDv7 generator inline in each service rather than
+use Node's crypto.randomUUID() built-in. Same shape, but we want full
+control over the entropy source for "future flexibility" — no concrete
+reason yet.`;
+
+/**
+ * Floor seed for the CEO plan review: a pricing-tier plan with a vague goal,
+ * a vague success metric, and a premise that was never checked with any
+ * developers.
+ */
+export const FORCING_FLOOR_CEO = `Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-ceo-floor.md (use Edit/Write to that exact path).
+
+# Plan: Launch a "developer-friendly" pricing tier
+
+## Goal
+Increase developer adoption.
+
+## Success metric
+More signups.
+
+## Premise
+We haven't talked to any developers about whether the current pricing
+is actually a barrier. The team agreed it "feels like" it should be cheaper.`;
+
+/**
+ * Floor seed for the design plan review: a landing page that center-aligns
+ * all copy, leaves 8px between the h1 and subhead, and gives the CTA the
+ * same visual weight as the secondary link next to it.
+ */
+export const FORCING_FLOOR_DESIGN = `Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-design-floor.md (use Edit/Write to that exact path).
+
+# Plan: Marketing landing page
+
+## Layout
+All headings, taglines, and body copy will be center-aligned for a
+"clean modern look." The hero h1 sits 8px above the subhead with no
+breathing room; the CTA button is the same visual weight as a
+secondary "Learn more" link directly beside it.`;
+
+/**
+ * Floor seed for the devex plan review: an eight-step manual onboarding flow
+ * that explicitly offers no quickstart command, hosted sandbox, or
+ * copy-pasteable example.
+ */
+export const FORCING_FLOOR_DEVEX = `Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-devex-floor.md (use Edit/Write to that exact path).
+
+# Plan: SDK quickstart docs
+
+## Onboarding flow
+Step 1: clone the repo.
+Step 2: install bun manually if not present.
+Step 3: copy .env.example to .env and fill in 8 environment variables.
+Step 4: run database migrations against your local Postgres.
+Step 5: start the dev server.
+Step 6: open the docs in a separate tab.
+Step 7: register an API key by emailing the team.
+Step 8: paste the key into your .env, restart the server, then make
+your first SDK call.
+
+No quickstart command, no hosted sandbox, no copy-pasteable curl example.`;
+
+/**
+ * Periodic-tier regression seed for multi-finding batching.
+ *
+ * Reproduces the shape of the May 2026 transcript bug with four distinct,
+ * non-trivial findings, one in each of plan-eng-review's standard sections
+ * (Architecture, Code Quality, Tests, Performance). The findings are
+ * independent of one another, so there is no legitimate reason to fold them
+ * into a single AskUserQuestion.
+ *
+ * test/skill-e2e-plan-eng-multi-finding-batching.test.ts uses this seed to
+ * assert that the agent fires >= 3 review-phase AUQs rather than batching
+ * them into a "## Decisions to confirm" section followed by ExitPlanMode.
+ * The floor is 3 rather than 4 because of the [N-1] tolerance carried over
+ * from the existing finding-count band convention.
+ */
+export const FORCING_BATCHING_ENG = `Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-eng-batching.md (use Edit/Write to that exact path).
+
+# Plan: Add background job retry framework
+
+## Architecture
+We'll roll a custom exponential-backoff scheduler inline in each worker
+rather than use the existing job library's built-in retry hooks. Same
+shape as the library version, but we want full control over the curve.
+
+## Code quality
+The retry envelope (compute delay, log attempt, dispatch) is duplicated
+across 5 worker files with copy-pasted bodies. We will leave the
+duplication for now and refactor "later."
+
+## Tests
+The existing \`processWebhookJob()\` flow gets rewritten as part of this
+change. No regression test for the prior at-most-once delivery guarantee
+is planned.
+
+## Performance
+On every retry we re-fetch the full job payload from the database, then
+iterate the payload to recompute the dependency graph. Could cache the
+graph on the first attempt; not planned.`;
--- a/test/fixtures/golden-ship-claude.md
+++ b/test/fixtures/golden-ship-claude.md
--- a/test/fixtures/golden/claude-ship-SKILL.md
+++ b/test/fixtures/golden/claude-ship-SKILL.md
--- a/test/fixtures/golden/codex-ship-SKILL.md
+++ b/test/fixtures/golden/codex-ship-SKILL.md
--- a/test/fixtures/golden/factory-ship-SKILL.md
+++ b/test/fixtures/golden/factory-ship-SKILL.md
--- a/test/fixtures/mode-posture/builder-idea.md
+++ b/test/fixtures/mode-posture/builder-idea.md
@@ -0,0 +1,15 @@
+# Weekend Project: Dependency Graph Visualizer
+
+I want to build a tool that takes a codebase and visualizes its dependency graph — modules, imports, which files depend on which. For fun, for learning. Maybe open-source it.
+
+## What I have so far
+
+- Rough idea: point it at a repo, get an interactive graph
+- Stack I'm leaning toward: TypeScript + D3 or Cytoscape for rendering
+- Potential: could work for JS/TS first, maybe Python later
+
+## What I don't know yet
+
+- How to make the visualization actually useful vs just pretty
+- Whether this should be a CLI, a web tool, or a VS Code extension
+- What would make someone else want to use it
--- a/test/fixtures/mode-posture/expansion-plan.md
+++ b/test/fixtures/mode-posture/expansion-plan.md
@@ -0,0 +1,23 @@
+# Plan: Team Velocity Dashboard
+
+## Context
+
+We're building a dashboard for engineering managers to track team code velocity — commits per engineer, PR cycle time, review latency, CI pass rate. The data already lives in GitHub; we're just aggregating it for a manager's single-pane view.
+
+## Changes
+
+1. New React component `TeamVelocityDashboard` in `src/dashboard/`
+2. REST API endpoint `GET /api/team/velocity?days=30` returning aggregated metrics
+3. Background job pulling GitHub data every 15 minutes into Postgres
+4. Simple filter UI: team, date range, metric
+
+## Architecture
+
+- Frontend: React + shadcn/ui
+- Backend: Express + PostgreSQL
+- Data source: GitHub REST API (cached 15min)
+
+## Open questions
+
+- Should we support multiple repos per team?
+- Do we show individual engineer names or aggregate only?
--- a/test/fixtures/mode-posture/forcing-pitch.md
+++ b/test/fixtures/mode-posture/forcing-pitch.md
@@ -0,0 +1,13 @@
+# Our Idea: AI Tools for Product Managers
+
+We're building AI tools for product managers at mid-market SaaS companies. The product combines a bunch of the things PMs already do — writing PRDs, gathering user feedback, analyzing usage data, drafting roadmaps — and uses LLMs to speed each of them up.
+
+## Who we're targeting
+
+Product managers at SaaS companies with 50-500 engineers. These PMs are stretched thin, juggle a lot of surface area, and would benefit from AI assistance.
+
+## What we've done so far
+
+- Talked to a few PMs we know from prior jobs
+- Built a prototype that summarizes Zoom customer calls into a PRD stub
+- Built a waitlist of about 40 signups from LinkedIn posts
--- a/test/fixtures/overlay-nudges.ts
+++ b/test/fixtures/overlay-nudges.ts
@@ -0,0 +1,487 @@
+/**
+ * Overlay-efficacy fixture registry.
+ *
+ * Each fixture defines a reproducible A/B test for one behavioral nudge
+ * embedded in a model-overlays/*.md file. The harness at
+ * test/skill-e2e-overlay-harness.test.ts iterates this registry and runs
+ * `fixture.trials` A/B trials per fixture, asserting `fixture.pass(arms)`.
+ *
+ * Adding a new overlay eval = one entry in this list. The harness handles
+ * arm wiring, concurrency, artifact storage, rate-limit retries, and the
+ * cross-harness diagnostic.
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import {
+  firstTurnParallelism,
+  type AgentSdkResult,
+} from '../helpers/agent-sdk-runner';
+
+// Absolute path two directory levels above this file's directory. Overlay
+// paths in fixtures are resolved against it during validation.
+const REPO_ROOT = path.resolve(__dirname, '..', '..');
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+/**
+ * Definition of one overlay A/B experiment: which overlay file to test,
+ * against which model, in what workspace, with what prompt, and how each
+ * trial is scored and the overall result judged.
+ *
+ * `validateFixtures` checks `id`, `trials`, `concurrency`, `model`,
+ * `userPrompt`, `overlayPath`, and that `setupWorkspace`, `metric`, and
+ * `pass` are functions. It does not check `allowedTools`, `maxTurns`, or
+ * `direction`.
+ */
+export interface OverlayFixture {
+  /** Unique, lowercase/digits/dash only. Used in artifact paths. */
+  id: string;
+  /** Path to the overlay file, relative to repo root. */
+  overlayPath: string;
+  /** API model ID, not the overlay family name. */
+  model: string;
+  /** Integer >= 3. Trials per arm. */
+  trials: number;
+  /** Max concurrent queries for this fixture's arms. Default 3. */
+  concurrency?: number;
+  /** Populate the workspace dir before each trial. */
+  setupWorkspace: (dir: string) => void;
+  /** The prompt the model receives. Non-empty. */
+  userPrompt: string;
+  /** Per-fixture tool allowlist. Omit to use runner default [Read, Glob, Grep, Bash]. */
+  allowedTools?: string[];
+  /** Max turns per trial. Omit to use runner default (5). */
+  maxTurns?: number;
+  /**
+   * Direction of the expected effect. `higher_is_better` = overlay should
+   * increase the metric (e.g. fanout, files touched for literal scope).
+   * `lower_is_better` = overlay should decrease it (e.g. Bash count, turn count).
+   * Used only for cosmetic logging in the test output; `pass` is the actual gate.
+   */
+  direction?: 'higher_is_better' | 'lower_is_better';
+  /** Compute the per-trial metric from the typed SDK result. */
+  metric: (r: AgentSdkResult) => number;
+  /** Acceptance predicate across all arms' per-trial metrics. */
+  pass: (arms: { overlay: number[]; off: number[] }) => boolean;
+}
+
+// ---------------------------------------------------------------------------
+// Validation
+// ---------------------------------------------------------------------------
+
+/**
+ * Sanity-check a fixture registry, throwing an Error that names the offending
+ * fixture at the first violation found.
+ *
+ * Checks per fixture, in order: `id` is non-empty, lowercase/digits/dash
+ * only, and not a duplicate; `trials` is an integer >= 3; `concurrency`,
+ * when present, is an integer >= 1; `model` and `userPrompt` are non-empty;
+ * `overlayPath` is relative, contains no '..', and exists under the repo
+ * root; `setupWorkspace`, `metric`, and `pass` are functions.
+ *
+ * @param fixtures - Registry entries to validate.
+ * @throws Error describing the first invalid fixture.
+ */
+export function validateFixtures(fixtures: OverlayFixture[]): void {
+  const idShape = /^[a-z0-9-]+$/;
+  const requiredCallbacks = ['setupWorkspace', 'metric', 'pass'] as const;
+  const seenIds = new Set<string>();
+
+  fixtures.forEach((fixture) => {
+    const { id, trials, concurrency, model, userPrompt, overlayPath } = fixture;
+
+    // Identity: well-formed and unique.
+    const idIsWellFormed = Boolean(id) && idShape.test(id);
+    if (!idIsWellFormed) {
+      throw new Error(
+        `fixture id must be non-empty, lowercase/digits/dash only: ${JSON.stringify(id)}`,
+      );
+    }
+    if (seenIds.has(id)) {
+      throw new Error(`duplicate fixture id: ${id}`);
+    }
+    seenIds.add(id);
+
+    // Trial sizing.
+    const trialsAreValid = Number.isInteger(trials) && trials >= 3;
+    if (!trialsAreValid) {
+      throw new Error(`${id}: trials must be an integer >= 3 (got ${trials})`);
+    }
+    const concurrencyIsValid =
+      concurrency === undefined || (Number.isInteger(concurrency) && concurrency >= 1);
+    if (!concurrencyIsValid) {
+      throw new Error(
+        `${id}: concurrency must be an integer >= 1 (got ${concurrency})`,
+      );
+    }
+
+    // Required strings.
+    if (!model) {
+      throw new Error(`${id}: model must be non-empty`);
+    }
+    if (!userPrompt) {
+      throw new Error(`${id}: userPrompt must be non-empty`);
+    }
+
+    // The overlay must stay inside the repo and actually exist on disk.
+    const leavesRepo = path.isAbsolute(overlayPath) || overlayPath.includes('..');
+    if (leavesRepo) {
+      throw new Error(
+        `${id}: overlayPath must be relative and must not contain '..' (got ${overlayPath})`,
+      );
+    }
+    const overlayOnDisk = path.resolve(REPO_ROOT, overlayPath);
+    if (!fs.existsSync(overlayOnDisk)) {
+      throw new Error(`${id}: overlay file not found at ${overlayPath}`);
+    }
+
+    // Callbacks: report the first one that is not a function.
+    const notCallable = requiredCallbacks.find((key) => typeof fixture[key] !== 'function');
+    if (notCallable !== undefined) {
+      throw new Error(`${id}: ${notCallable} must be a function`);
+    }
+  });
+}
+
+// ---------------------------------------------------------------------------
+// Metric + predicate helpers
+// ---------------------------------------------------------------------------
+
+/** Arithmetic mean of `xs`. An empty list yields 0 instead of NaN. */
+function mean(xs: number[]): number {
+  const count = xs.length;
+  if (count === 0) return 0;
+  let total = 0;
+  for (const x of xs) {
+    total += x;
+  }
+  return total / count;
+}
+
+/**
+ * Standard fanout predicate. Passes only when both hold:
+ *   - at least 3 overlay trials emitted >= 2 parallel tool_use blocks in
+ *     the first turn (the floor), and
+ *   - the overlay mean exceeds the off mean by at least 0.5 (the lift).
+ *
+ * The lift catches "overlay nudges every trial slightly"; the floor catches
+ * "overlay sometimes triggers real fanout". Requiring both rejects a 0.5
+ * lift in which every overlay trial still emitted a single call.
+ */
+export function fanoutPass(arms: { overlay: number[]; off: number[] }): boolean {
+  const { overlay, off } = arms;
+
+  let trialsWithFanout = 0;
+  for (const parallelCalls of overlay) {
+    if (parallelCalls >= 2) trialsWithFanout += 1;
+  }
+  if (trialsWithFanout < 3) return false;
+
+  return mean(overlay) - mean(off) >= 0.5;
+}
+
+/**
+ * Generic "lower is better" pass predicate: overlay mean should drop the
+ * metric by at least 20% vs baseline. Used for nudges like "effort-match"
+ * (fewer turns) and "dedicated tools vs Bash" (fewer Bash calls).
+ *
+ * @param arms - Per-trial metrics for the overlay-on and overlay-off arms.
+ * @returns `true` when the overlay mean is at most 80% of the off mean. With
+ *   a zero baseline, `true` when the overlay mean is also <= 0. Always
+ *   `false` when either arm has no trials.
+ */
+export function lowerIsBetter20Pct(arms: { overlay: number[]; off: number[] }): boolean {
+  // With no trials in an arm there is nothing to compare. mean() maps an
+  // empty list to 0, which would otherwise read as a perfect "lower" score
+  // and let an experiment with no overlay data pass.
+  if (arms.overlay.length === 0 || arms.off.length === 0) return false;
+
+  const meanOff = mean(arms.off);
+  const meanOn = mean(arms.overlay);
+  // A zero baseline cannot be cut by 20%; only require that it did not rise.
+  if (meanOff === 0) return meanOn <= meanOff;
+  return meanOn <= meanOff * 0.8;
+}
+
+/**
+ * Generic "higher is better" pass predicate: overlay mean should lift the
+ * metric by at least 20% vs baseline. Used for nudges like "literal
+ * interpretation" (more files touched when scope is ambiguous).
+ *
+ * @param arms - Per-trial metrics for the overlay-on and overlay-off arms.
+ * @returns `true` when the overlay mean is at least 120% of the off mean.
+ *   With a zero baseline, `true` when the overlay mean is > 0. Always
+ *   `false` when either arm has no trials.
+ */
+export function higherIsBetter20Pct(arms: { overlay: number[]; off: number[] }): boolean {
+  // With no trials in an arm there is nothing to compare. mean() maps an
+  // empty list to 0, so an empty baseline would let any positive overlay
+  // metric pass without a real comparison.
+  if (arms.overlay.length === 0 || arms.off.length === 0) return false;
+
+  const meanOff = mean(arms.off);
+  const meanOn = mean(arms.overlay);
+  // A 20% lift over zero is still zero, so require any positive value instead.
+  if (meanOff === 0) return meanOn > 0;
+  return meanOn >= meanOff * 1.2;
+}
+
+// ---------------------------------------------------------------------------
+// Metrics
+// ---------------------------------------------------------------------------
+
+/**
+ * Number of recorded tool calls in the result whose tool is `Bash`.
+ * Serves as the signal for the "dedicated tools over Bash" nudge in claude.md.
+ */
+export function bashToolCallCount(r: AgentSdkResult): number {
+  let bashCalls = 0;
+  for (const call of r.toolCalls) {
+    if (call.tool === 'Bash') bashCalls += 1;
+  }
+  return bashCalls;
+}
+
+/**
+ * The result's `turnsUsed` value, unchanged. Serves as the signal for the
+ * "effort-match the step" nudge in opus-4-7.md, where trivial prompts are
+ * expected to finish quickly.
+ */
+export function turnsToCompletion(r: AgentSdkResult): number {
+  const { turnsUsed } = r;
+  return turnsUsed;
+}
+
+/**
+ * Number of distinct `file_path` values seen across Edit, Write, and
+ * MultiEdit tool calls. Calls with a missing or empty `file_path` are ignored.
+ * Serves as the signal for the "literal interpretation" nudge in
+ * opus-4-7.md: "fix the tests" with several failures should touch all of them.
+ */
+export function uniqueFilesEdited(r: AgentSdkResult): number {
+  const isFileWrite = (tool: unknown): boolean =>
+    tool === 'Edit' || tool === 'Write' || tool === 'MultiEdit';
+
+  const writtenPaths = r.toolCalls
+    .filter((call) => isFileWrite(call.tool))
+    .map((call) => (call.input as { file_path?: string } | null)?.file_path)
+    .filter((filePath) => Boolean(filePath));
+
+  return new Set(writtenPaths).size;
+}
+
+// ---------------------------------------------------------------------------
+// Fixtures
+// ---------------------------------------------------------------------------
+
+/** API model ID used for the derived Sonnet variant of every base fixture. */
+const SONNET_4_6_MODEL = 'claude-sonnet-4-6';
+
+/**
+ * Base fixtures, all run against Opus 4.7. Each one targets a single nudge
+ * in an overlay file. The Sonnet variants in OVERLAY_FIXTURES are derived
+ * from this list, so a change made here applies to both models.
+ */
+const OPUS_4_7_FIXTURES: OverlayFixture[] = [
+  {
+    id: 'opus-4-7-fanout-toy',
+    overlayPath: 'model-overlays/opus-4-7.md',
+    model: 'claude-opus-4-7',
+    trials: 10,
+    concurrency: 3,
+    setupWorkspace: (dir) => {
+      fs.writeFileSync(path.join(dir, 'alpha.txt'), 'Alpha file: used in module A.\n');
+      fs.writeFileSync(path.join(dir, 'beta.txt'), 'Beta file: used in module B.\n');
+      fs.writeFileSync(path.join(dir, 'gamma.txt'), 'Gamma file: used in module C.\n');
+    },
+    userPrompt:
+      'Read alpha.txt, beta.txt, and gamma.txt and summarize each in one line.',
+    metric: (r) => firstTurnParallelism(r.assistantTurns[0]),
+    pass: fanoutPass,
+  },
+  {
+    id: 'opus-4-7-fanout-realistic',
+    overlayPath: 'model-overlays/opus-4-7.md',
+    model: 'claude-opus-4-7',
+    trials: 10,
+    concurrency: 3,
+    setupWorkspace: (dir) => {
+      fs.writeFileSync(
+        path.join(dir, 'app.ts'),
+        "import { config } from './config';\nimport { util } from './src/util';\n\nexport function main() { return config.name + ':' + util(); }\n",
+      );
+      fs.writeFileSync(
+        path.join(dir, 'config.ts'),
+        "export const config = { name: 'demo', version: 1 };\n",
+      );
+      fs.writeFileSync(
+        path.join(dir, 'README.md'),
+        '# demo project\n\nA small demo. Entry: `app.ts`. Config: `config.ts`.\n',
+      );
+      fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
+      fs.writeFileSync(
+        path.join(dir, 'src', 'util.ts'),
+        "export function util() { return 'util-result'; }\n",
+      );
+    },
+    userPrompt:
+      'Audit this project: read app.ts, config.ts, and README.md, and glob for ' +
+      'every .ts file under src/. Summarize what you find in 3 bullet points.',
+    metric: (r) => firstTurnParallelism(r.assistantTurns[0]),
+    pass: fanoutPass,
+  },
+
+  // -------------------------------------------------------------------------
+  // claude.md / "Dedicated tools over Bash"
+  // -------------------------------------------------------------------------
+  {
+    id: 'claude-dedicated-tools-vs-bash',
+    overlayPath: 'model-overlays/claude.md',
+    model: 'claude-opus-4-7',
+    trials: 10,
+    concurrency: 3,
+    direction: 'lower_is_better',
+    // 5 files + summary = needs more than default 5 turns. SDK throws
+    // instead of returning a result when it hits the cap.
+    maxTurns: 15,
+    setupWorkspace: (dir) => {
+      fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
+      fs.writeFileSync(path.join(dir, 'src', 'index.ts'), "export const x = 1;\n");
+      fs.writeFileSync(path.join(dir, 'src', 'util.ts'), "export function util() { return 42; }\n");
+      fs.writeFileSync(path.join(dir, 'src', 'types.ts'), "export type Foo = { a: number };\n");
+      fs.writeFileSync(path.join(dir, 'src', 'config.ts'), "export const c = { n: 'demo' };\n");
+      fs.writeFileSync(path.join(dir, 'src', 'api.ts'), "export async function fetchFoo() { return null; }\n");
+    },
+    userPrompt:
+      "List every TypeScript file under src/ and tell me what each exports. " +
+      "You may use any tools available.",
+    // Metric: total Bash tool_use count across the whole session.
+    // The overlay says "prefer Read/Glob/Grep over cat/find/grep shell."
+    // A model following that should emit Glob + Read, not Bash ls/find/cat.
+    metric: bashToolCallCount,
+    pass: lowerIsBetter20Pct,
+  },
+
+  // -------------------------------------------------------------------------
+  // opus-4-7.md / "Effort-match the step"
+  // -------------------------------------------------------------------------
+  {
+    id: 'opus-4-7-effort-match-trivial',
+    overlayPath: 'model-overlays/opus-4-7.md',
+    model: 'claude-opus-4-7',
+    trials: 10,
+    concurrency: 3,
+    direction: 'lower_is_better',
+    maxTurns: 8,
+    setupWorkspace: (dir) => {
+      fs.writeFileSync(
+        path.join(dir, 'config.json'),
+        '{"name": "demo", "version": "1.0.0"}\n',
+      );
+    },
+    userPrompt: "What's the version in config.json?",
+    // Metric: turns used to reach a result.
+    // The overlay says "simple lookups don't need deep reasoning."
+    // A trivial prompt like this should complete in 2 turns
+    // (Read, respond). Extended deliberation bumps this up.
+    metric: turnsToCompletion,
+    pass: lowerIsBetter20Pct,
+  },
+
+  // -------------------------------------------------------------------------
+  // opus-4-7.md / "Literal interpretation awareness"
+  // -------------------------------------------------------------------------
+  {
+    id: 'opus-4-7-literal-interpretation',
+    overlayPath: 'model-overlays/opus-4-7.md',
+    model: 'claude-opus-4-7',
+    trials: 10,
+    concurrency: 3,
+    direction: 'higher_is_better',
+    // Allow Edit and Write so the model can fix the tests.
+    allowedTools: ['Read', 'Glob', 'Grep', 'Bash', 'Edit', 'Write'],
+    maxTurns: 15,
+    setupWorkspace: (dir) => {
+      // Three failing test files. The "obvious" one is named first; the
+      // other two require the model to enumerate to notice them.
+      fs.writeFileSync(
+        path.join(dir, 'auth.test.ts'),
+        "import { test, expect } from 'bun:test';\n" +
+          "test('auth returns true', () => { expect(1).toBe(2); });\n",
+      );
+      fs.writeFileSync(
+        path.join(dir, 'billing.test.ts'),
+        "import { test, expect } from 'bun:test';\n" +
+          "test('billing computes total', () => { expect('ten').toBe('10'); });\n",
+      );
+      fs.writeFileSync(
+        path.join(dir, 'notifications.test.ts'),
+        "import { test, expect } from 'bun:test';\n" +
+          "test('notifications fire on save', () => { expect([1,2]).toEqual([1,2,3]); });\n",
+      );
+      fs.writeFileSync(
+        path.join(dir, 'README.md'),
+        "# demo\n\nSmall project with three failing test files.\n",
+      );
+    },
+    userPrompt: "Fix the failing tests.",
+    // Metric: number of unique test files the model edited/wrote.
+    // The overlay says "fix ALL failing tests, not just the obvious one."
+    // Overlay-ON should touch all 3 test files. Overlay-OFF might stop
+    // at the first one after making it pass.
+    metric: uniqueFilesEdited,
+    pass: higherIsBetter20Pct,
+  },
+];
+
+/**
+ * Full registry iterated by the overlay harness: the five Opus 4.7 fixtures
+ * followed by a Sonnet 4.6 variant of each, in the same order. A variant
+ * differs from its base only in `id` (suffixed with "-sonnet") and `model`.
+ */
+export const OVERLAY_FIXTURES: OverlayFixture[] = [
+  ...OPUS_4_7_FIXTURES,
+
+  // =========================================================================
+  // Sonnet 4.6 variants of the Opus-4.7 fixtures.
+  //
+  // Rationale: /claude.md + /opus-4-7.md overlays measured as no-op or
+  // counterproductive on Opus 4.7. Before deleting the whole overlay stack,
+  // check whether weaker Claude models (Sonnet, Haiku) benefit from the same
+  // nudges. Same overlays, same prompts, same metrics, different model ID.
+  // Sonnet is ~4x cheaper than Opus so these 5 add ~$3 to a run.
+  // =========================================================================
+  ...OPUS_4_7_FIXTURES.map((fixture) => ({
+    ...fixture,
+    id: `${fixture.id}-sonnet`,
+    model: SONNET_4_6_MODEL,
+  })),
+];
+
+// Validate at module load so a broken fixture fails fast at test startup,
+// not mid-run after burning API dollars. validateFixtures throws on the
+// first invalid entry, so importing this module fails if the registry is
+// malformed or an overlay file is missing.
+validateFixtures(OVERLAY_FIXTURES);
--- a/test/fixtures/plans/ui-heavy-feature.md
+++ b/test/fixtures/plans/ui-heavy-feature.md
@@ -0,0 +1,22 @@
+# Plan: User Dashboard Page
+
+## Context
+We're shipping a new user dashboard at `/dashboard` showing recent activity,
+a notifications panel, and quick-action buttons. Users land here after login.
+
+## UI Scope
+- New React page component `UserDashboard.tsx` at `src/pages/`
+- Three new sub-components: `ActivityFeed`, `NotificationsPanel`, `QuickActions`
+- Tailwind CSS for layout, mobile-first responsive (breakpoints: sm/md/lg)
+- Empty state, loading skeleton, error state for each panel
+- Hover states + focus-visible outlines on every interactive element
+- Modal dialog for "Mark all as read" on notifications panel
+- Toast notification system for action feedback
+
+## Backend
+- New REST endpoint `GET /api/dashboard` returns `{ activity, notifications, quickActions }`
+- Backed by existing PostgreSQL tables; no schema changes
+
+## Out of scope
+- Dark mode (separate plan)
+- Personalization / customization (separate plan)
--- a/test/fixtures/qa-eval-checkout-ground-truth.json
+++ b/test/fixtures/qa-eval-checkout-ground-truth.json
@@ -0,0 +1,43 @@
+{
+  "fixture": "qa-eval-checkout.html",
+  "bugs": [
+    {
+      "id": "broken-email-regex",
+      "category": "functional",
+      "severity": "high",
+      "description": "Email validation accepts 'user@' as valid — regex pattern [^@]+@[^@] is missing domain requirement",
+      "detection_hint": "email|regex|validation|accepts|invalid|user@|pattern"
+    },
+    {
+      "id": "nan-total",
+      "category": "functional",
+      "severity": "high",
+      "description": "Clearing the quantity field shows 'Total: $NaN' — parseInt on empty string returns NaN with no fallback",
+      "detection_hint": "NaN|total|quantity|empty|price|calculation|clear"
+    },
+    {
+      "id": "cc-field-overflow",
+      "category": "visual",
+      "severity": "medium",
+      "description": "Credit card input has no maxlength attribute — entering >20 characters causes text to overflow the container",
+      "detection_hint": "credit card|maxlength|overflow|cc|input|long|container"
+    },
+    {
+      "id": "missing-required-zip",
+      "category": "functional",
+      "severity": "medium",
+      "description": "Zip code field has no 'required' attribute — form can be submitted without a zip code",
+      "detection_hint": "zip|required|missing|form|submit|shipping|postal"
+    },
+    {
+      "id": "stripe-not-defined",
+      "category": "console",
+      "severity": "high",
+      "description": "Form submit triggers 'Uncaught ReferenceError: stripe is not defined' — payment SDK not loaded",
+      "detection_hint": "stripe|ReferenceError|not defined|console|error|submit|payment"
+    }
+  ],
+  "total_bugs": 5,
+  "minimum_detection": 2,
+  "max_false_positives": 5
+}
--- a/test/fixtures/qa-eval-ground-truth.json
+++ b/test/fixtures/qa-eval-ground-truth.json
@@ -0,0 +1,43 @@
+{
+  "fixture": "qa-eval.html",
+  "bugs": [
+    {
+      "id": "broken-link",
+      "category": "functional",
+      "severity": "medium",
+      "description": "Navigation link 'Resources' points to /nonexistent-404-page which returns 404",
+      "detection_hint": "link|404|broken|dead|nonexistent|Resources"
+    },
+    {
+      "id": "disabled-submit",
+      "category": "functional",
+      "severity": "high",
+      "description": "Contact form submit button has 'disabled' attribute permanently — form can never be submitted",
+      "detection_hint": "disabled|submit|button|form|cannot submit|contact"
+    },
+    {
+      "id": "content-overflow",
+      "category": "visual",
+      "severity": "medium",
+      "description": "Statistics text is clipped by overflow:hidden container — content wider than 400px container",
+      "detection_hint": "overflow|clipped|truncated|hidden|text cut|statistics"
+    },
+    {
+      "id": "missing-alt",
+      "category": "accessibility",
+      "severity": "medium",
+      "description": "Logo image (<img src='/logo.png'>) has no alt attribute",
+      "detection_hint": "alt|accessibility|image|a11y|missing alt|logo"
+    },
+    {
+      "id": "console-error",
+      "category": "console",
+      "severity": "high",
+      "description": "TypeError on page load: Cannot read properties of undefined (reading 'map')",
+      "detection_hint": "console|error|TypeError|undefined|map"
+    }
+  ],
+  "total_bugs": 5,
+  "minimum_detection": 2,
+  "max_false_positives": 5
+}
--- a/test/fixtures/qa-eval-spa-ground-truth.json
+++ b/test/fixtures/qa-eval-spa-ground-truth.json
@@ -0,0 +1,43 @@
+{
+  "fixture": "qa-eval-spa.html",
+  "bugs": [
+    {
+      "id": "broken-route",
+      "category": "functional",
+      "severity": "high",
+      "description": "Products nav link points to #/prodcts (typo) instead of #/products — shows 'Page not found'",
+      "detection_hint": "route|prodcts|typo|products|not found|broken link|navigation"
+    },
+    {
+      "id": "stale-cart-state",
+      "category": "functional",
+      "severity": "medium",
+      "description": "Cart count persists across route changes — never resets when navigating away from products",
+      "detection_hint": "cart|count|state|persist|reset|stale|navigation"
+    },
+    {
+      "id": "async-fetch-error",
+      "category": "functional",
+      "severity": "high",
+      "description": "Product list briefly loads then shows 'Error: Failed to fetch products from API' after 1 second",
+      "detection_hint": "error|fetch|products|API|loading|failed|async"
+    },
+    {
+      "id": "missing-aria-current",
+      "category": "accessibility",
+      "severity": "medium",
+      "description": "Navigation links have no aria-current attribute to indicate the active route",
+      "detection_hint": "aria|current|active|navigation|accessibility|a11y"
+    },
+    {
+      "id": "console-warn-leak",
+      "category": "console",
+      "severity": "medium",
+      "description": "console.warn fires on every route change: 'Possible memory leak detected: 11 event listeners'",
+      "detection_hint": "console|warn|memory leak|listener|event|warning"
+    }
+  ],
+  "total_bugs": 5,
+  "minimum_detection": 2,
+  "max_false_positives": 5
+}
--- a/test/fixtures/review-army-migration.sql
+++ b/test/fixtures/review-army-migration.sql
@@ -0,0 +1,5 @@
+-- Migration: Drop user email and phone_number columns
+-- WARNING: This migration is intentionally unsafe for testing
+ALTER TABLE users DROP COLUMN email;
+ALTER TABLE users DROP COLUMN phone_number;
+-- No backfill, no reversibility check, no data preservation
--- a/test/fixtures/review-army-n-plus-one.rb
+++ b/test/fixtures/review-army-n-plus-one.rb
@@ -0,0 +1,12 @@
+# N+1 query example — intentionally bad for testing
+class PostsController
+  def index
+    @posts = Post.all # loads posts only — no includes/preload for author or comments
+    @posts.each do |post|
+      # N+1: post.author is loaded lazily, so the Author table is queried once per post
+      puts post.author.name
+      # N+1: post.comments.count runs a separate count query on Comments for every post
+      puts post.comments.count
+    end
+  end
+end
--- a/test/fixtures/review-eval-design-slop.css
+++ b/test/fixtures/review-eval-design-slop.css
@@ -0,0 +1,86 @@
+/* Planted design anti-patterns for E2E eval — 7 issues (Issue 6 lives in the companion HTML fixture) */
+
+/* Issue 1: [HIGH] Blacklisted font (Papyrus) */
+/* Issue 2: [HIGH] Body text < 16px (14px) */
+body {
+  font-family: 'Papyrus', sans-serif;
+  font-size: 14px;
+  margin: 0;
+  padding: 0;
+}
+
+/* Issue 5: [MEDIUM] Purple/violet gradient background */
+.hero {
+  background: linear-gradient(135deg, #6366f1, #8b5cf6);
+  text-align: center;
+  padding: 80px 20px;
+  color: white;
+}
+
+.hero h1 {
+  text-align: center;
+  font-size: 48px;
+}
+
+.hero p {
+  text-align: center;
+  font-size: 20px;
+}
+
+/* Issue 7: [LOW] 3-column feature grid with icon circles */
+.features {
+  display: grid;
+  grid-template-columns: repeat(3, 1fr);
+  gap: 24px;
+  padding: 60px 40px;
+  text-align: center;
+}
+
+.feature-card {
+  border-radius: 24px;
+  padding: 32px;
+  text-align: center;
+  background: #f9fafb;
+}
+
+/* Icon in colored circle — AI slop pattern */
+.icon-circle {
+  width: 60px;
+  height: 60px;
+  border-radius: 50%;
+  background: #ede9fe;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  margin: 0 auto 16px;
+  font-size: 24px;
+}
+
+/* Issue 3: [HIGH] outline: none without replacement */
+button {
+  outline: none;
+  background: #6366f1;
+  color: white;
+  border: none;
+  padding: 12px 24px;
+  border-radius: 24px;
+  cursor: pointer;
+}
+
+.small-link {
+  font-size: 11px;
+  padding: 4px 8px;
+}
+
+/* Issue 4: [HIGH] !important usage */
+.override {
+  color: red !important;
+  margin-left: 10px !important;
+}
+
+.footer {
+  text-align: center;
+  padding: 40px;
+  background: #1e1b4b;
+  color: white;
+}
--- a/test/fixtures/review-eval-design-slop.html
+++ b/test/fixtures/review-eval-design-slop.html
@@ -0,0 +1,41 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <link rel="stylesheet" href="styles.css">
+  <title>Our Platform</title>
+</head>
+<body>
+  <!-- Issue 6: [MEDIUM] Generic hero copy ("Welcome to...", "all-in-one solution") -->
+  <div class="hero">
+    <h1>Welcome to Our Platform</h1>
+    <p>Your all-in-one solution for everything you need</p>
+    <button>Get Started</button>
+  </div>
+
+  <!-- Issue 7: [LOW] 3-column feature grid with icon-in-circle + title + description -->
+  <div class="features">
+    <div class="feature-card">
+      <div class="icon-circle">&#9733;</div>
+      <h3>Feature One</h3>
+      <p>A short description of this amazing feature that will change your life.</p>
+    </div>
+    <div class="feature-card">
+      <div class="icon-circle">&#9889;</div>
+      <h3>Feature Two</h3>
+      <p>Another incredible capability that sets us apart from the competition.</p>
+    </div>
+    <div class="feature-card">
+      <div class="icon-circle">&#9881;</div>
+      <h3>Feature Three</h3>
+      <p>Yet another powerful tool to streamline your workflow effortlessly.</p>
+    </div>
+  </div>
+
+  <div class="footer">
+    <p class="override">Unlock the power of our platform today</p>
+    <a href="#" class="small-link">Terms of Service</a>
+  </div>
+</body>
+</html>
--- a/test/fixtures/review-eval-enum-diff.rb
+++ b/test/fixtures/review-eval-enum-diff.rb
@@ -0,0 +1,30 @@
+# Feature branch version: adds "returned" status but misses consumers
+class Order < ApplicationRecord
+  STATUSES = %w[pending processing shipped delivered returned].freeze # 'returned' is the status this branch adds
+
+  validates :status, inclusion: { in: STATUSES }
+
+  def display_status
+    case status
+    when 'pending'    then 'Awaiting processing'
+    when 'processing' then 'Being prepared'
+    when 'shipped'    then 'On the way'
+    when 'delivered'  then 'Delivered'
+    # BUG: no `when 'returned'` and no `else` — display_status returns nil for returned orders
+    end
+  end
+
+  def can_cancel?
+    # BUG: 'returned' was never considered here — it silently evaluates to false (not cancellable)
+    %w[pending processing].include?(status)
+  end
+
+  def notify_customer
+    case status
+    when 'pending'    then OrderMailer.confirmation(self).deliver_later
+    when 'shipped'    then OrderMailer.shipped(self).deliver_later
+    when 'delivered'  then OrderMailer.delivered(self).deliver_later
+    # BUG: no branch for 'returned' — no mail is sent, so the customer won't know the return was received
+    end
+  end
+end
--- a/test/fixtures/review-eval-enum.rb
+++ b/test/fixtures/review-eval-enum.rb
@@ -0,0 +1,27 @@
+# Existing file on main: order model with status handling
+class Order < ApplicationRecord
+  STATUSES = %w[pending processing shipped delivered].freeze # the full set of statuses an order may hold
+
+  validates :status, inclusion: { in: STATUSES } # any status outside STATUSES fails validation
+
+  def display_status # human-readable label for the current status; nil when no branch matches
+    case status
+    when 'pending'    then 'Awaiting processing'
+    when 'processing' then 'Being prepared'
+    when 'shipped'    then 'On the way'
+    when 'delivered'  then 'Delivered'
+    end
+  end
+
+  def can_cancel? # true only while status is 'pending' or 'processing'
+    %w[pending processing].include?(status)
+  end
+
+  def notify_customer # sends the status-specific mail via deliver_later; 'processing' has no branch and sends nothing
+    case status
+    when 'pending'    then OrderMailer.confirmation(self).deliver_later
+    when 'shipped'    then OrderMailer.shipped(self).deliver_later
+    when 'delivered'  then OrderMailer.delivered(self).deliver_later
+    end
+  end
+end
--- a/test/fixtures/review-eval-vuln.rb
+++ b/test/fixtures/review-eval-vuln.rb
@@ -0,0 +1,14 @@
+class UserController < ApplicationController
+  def show
+    # SQL injection — params[:id] is interpolated straight into the WHERE string instead of being bound
+    @user = User.where("id = #{params[:id]}").first
+    render json: @user
+  end
+
+  def promote
+    # Bypasses ActiveRecord safeguards — update_column writes directly, skipping validations and callbacks
+    @user = User.find(params[:id])
+    @user.update_column(:role, 'admin')
+    head :ok
+  end
+end