Initial import from garrytan/gstack@026751e (main snapshot via local relay)
Some checks failed
Workflow Lint / actionlint (push) Has been cancelled
Build CI Image / build (push) Has been cancelled
Skill Docs Freshness / check-freshness (push) Has been cancelled
Periodic Evals / build-image (push) Has been cancelled
Periodic Evals / evals (map[file:test/codex-e2e.test.ts name:e2e-codex]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/gemini-e2e.test.ts name:e2e-gemini]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-design.test.ts name:e2e-design]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-plan.test.ts name:e2e-plan]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-bugs.test.ts name:e2e-qa-bugs]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-workflow.test.ts name:e2e-qa-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-review.test.ts name:e2e-review]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-workflow.test.ts name:e2e-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-routing-e2e.test.ts name:e2e-routing]) (push) Has been cancelled

Source: https://github.com/garrytan/gstack/commit/026751e
This commit is contained in:
Rocky
2026-05-19 21:18:17 +02:00
commit 834c6db075
797 changed files with 267839 additions and 0 deletions

76
test/fixtures/coverage-audit-fixture.ts vendored Normal file
View File

@@ -0,0 +1,76 @@
/**
* Shared fixture for test coverage audit E2E tests.
*
* Creates a Node.js project with billing source code that has intentional
* test coverage gaps: processPayment has happy-path-only tests,
* refundPayment has no tests at all.
*
* Used by: ship-coverage-audit E2E, review-coverage-audit E2E
*/
import * as fs from 'fs';
import * as path from 'path';
import { spawnSync } from 'child_process';
/**
 * Populate `dir` with a small Node.js billing project whose test suite has
 * deliberate coverage gaps, then initialise a git repository on `main` with
 * one commit and leave the checkout on a new `feature/billing` branch.
 *
 * `dir` itself is not created here; only the `src/` and `test/`
 * subdirectories are. The results of the git commands are not inspected.
 *
 * @param dir Directory to populate.
 */
export function createCoverageAuditFixture(dir: string): void {
  const inDir = (...segments: string[]): string => path.join(dir, ...segments);
  const writeFile = (segments: string[], contents: string): void => {
    fs.writeFileSync(inDir(...segments), contents);
  };

  // Manifest: vitest is declared as a dev dependency, but the test script is
  // only a placeholder echo.
  const manifest = {
    name: 'test-coverage-app',
    version: '1.0.0',
    type: 'module',
    scripts: { test: 'echo "no tests yet"' },
    devDependencies: { vitest: '^1.0.0' },
  };
  writeFile(['package.json'], JSON.stringify(manifest, null, 2));

  // Minimal vitest config plus the release bookkeeping files.
  writeFile(
    ['vitest.config.ts'],
    `import { defineConfig } from 'vitest/config';\nexport default defineConfig({ test: {} });\n`,
  );
  writeFile(['VERSION'], '0.1.0.0\n');
  writeFile(['CHANGELOG.md'], '# Changelog\n');

  // Source under test: two functions, each with several code paths.
  const billingSource = `
export function processPayment(amount: number, currency: string) {
if (amount <= 0) throw new Error('Invalid amount');
if (currency !== 'USD' && currency !== 'EUR') throw new Error('Unsupported currency');
return { status: 'success', amount, currency };
}
export function refundPayment(paymentId: string, reason: string) {
if (!paymentId) throw new Error('Payment ID required');
if (!reason) throw new Error('Reason required');
return { status: 'refunded', paymentId, reason };
}
`;
  fs.mkdirSync(inDir('src'), { recursive: true });
  writeFile(['src', 'billing.ts'], billingSource);

  // A single happy-path test; the GAP comments mark what is left uncovered.
  const billingTest = `
import { describe, test, expect } from 'vitest';
import { processPayment } from '../src/billing';
describe('processPayment', () => {
test('processes valid payment', () => {
const result = processPayment(100, 'USD');
expect(result.status).toBe('success');
});
// GAP: no test for invalid amount
// GAP: no test for unsupported currency
// GAP: refundPayment not tested at all
});
`;
  fs.mkdirSync(inDir('test'), { recursive: true });
  writeFile(['test', 'billing.test.ts'], billingTest);

  // Git history: one commit on main, then switch to the feature branch.
  const gitSteps: string[][] = [
    ['init', '-b', 'main'],
    ['config', 'user.email', 'test@test.com'],
    ['config', 'user.name', 'Test'],
    ['add', '.'],
    ['commit', '-m', 'initial commit'],
    ['checkout', '-b', 'feature/billing'],
  ];
  for (const gitArgs of gitSteps) {
    spawnSync('git', gitArgs, { cwd: dir, stdio: 'pipe', timeout: 5000 });
  }
}

7
test/fixtures/eval-baselines.json vendored Normal file
View File

@@ -0,0 +1,7 @@
{
"command_reference": { "clarity": 4, "completeness": 3, "actionability": 4 },
"snapshot_flags": { "clarity": 4, "completeness": 4, "actionability": 4 },
"browse_skill": { "clarity": 4, "completeness": 4, "actionability": 4 },
"qa_workflow": { "clarity": 4, "completeness": 4, "actionability": 4 },
"qa_health_rubric": { "clarity": 4, "completeness": 3, "actionability": 4 }
}

122
test/fixtures/forcing-finding-seeds.ts vendored Normal file
View File

@@ -0,0 +1,122 @@
/**
* Per-skill draft-plan seeds engineered to surface at least one
* review-phase finding in the corresponding plan-* review skill.
*
* Used by gate-tier finding-floor tests
* (test/skill-e2e-plan-{eng,ceo,design,devex}-finding-floor.test.ts) as
* the minimum-cost regression for the May 2026 transcript bug:
* "/plan-eng-review reviewed a real PR diff, wrote a multi-section
* review plan to ~/.claude/plans/ and called ExitPlanMode without
* ever firing AskUserQuestion."
*
* Each seed is small and pre-loaded with one obvious finding the
* matching skill cannot honestly miss. Floor tests assert
* `reviewCount >= 1` — i.e., the model fired at least one review-phase
* AUQ before reaching plan_ready / completion_summary / ceiling.
*
* Each seed includes the standard "write your plan-mode plan to /tmp/…"
* preamble that the existing periodic finding-count fixtures use, so
* the agent has a concrete plan-file target. The /tmp path is unique
* per skill to avoid collisions if floor tests run in parallel.
*
* For a deeper [N-1, N+2] count band assertion, see the periodic
* test/skill-e2e-plan-{X}-finding-count.test.ts fixtures.
*/
/**
* Engineering-review floor seed. The plan's only section (Architecture)
* proposes hand-rolling a UUIDv7 generator in every service instead of using
* Node's built-in crypto.randomUUID(), and admits there is no concrete
* reason for it — that is the planted finding.
*
* The first line tells the agent to write its plan-mode plan to
* /tmp/gstack-test-plan-eng-floor.md. Lines are joined with '\n'; the text
* is prompt payload, so its exact wording is significant.
*/
export const FORCING_FLOOR_ENG = [
'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-eng-floor.md (use Edit/Write to that exact path).',
'',
'# Plan: Add request-id propagation across services',
'',
'## Architecture',
"We'll roll a custom UUIDv7 generator inline in each service rather than",
"use Node's crypto.randomUUID() built-in. Same shape, but we want full",
'control over the entropy source for "future flexibility" — no concrete',
'reason yet.',
].join('\n');
/**
* CEO-review floor seed. The plan has a one-line goal ("Increase developer
* adoption."), a one-line success metric ("More signups."), and a premise
* that openly states no developers were consulted about pricing — the
* planted finding is the unvalidated premise behind the new tier.
*
* The first line tells the agent to write its plan-mode plan to
* /tmp/gstack-test-plan-ceo-floor.md. Lines are joined with '\n'; the text
* is prompt payload, so its exact wording is significant.
*/
export const FORCING_FLOOR_CEO = [
'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-ceo-floor.md (use Edit/Write to that exact path).',
'',
'# Plan: Launch a "developer-friendly" pricing tier',
'',
'## Goal',
'Increase developer adoption.',
'',
'## Success metric',
'More signups.',
'',
'## Premise',
"We haven't talked to any developers about whether the current pricing",
'is actually a barrier. The team agreed it "feels like" it should be cheaper.',
].join('\n');
/**
* Design-review floor seed. The Layout section center-aligns all copy,
* leaves 8px between the hero h1 and the subhead, and gives the CTA button
* the same visual weight as the adjacent secondary link — layout choices
* written to draw at least one design finding.
*
* The first line tells the agent to write its plan-mode plan to
* /tmp/gstack-test-plan-design-floor.md. Lines are joined with '\n'; the
* text is prompt payload, so its exact wording is significant.
*/
export const FORCING_FLOOR_DESIGN = [
'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-design-floor.md (use Edit/Write to that exact path).',
'',
'# Plan: Marketing landing page',
'',
'## Layout',
'All headings, taglines, and body copy will be center-aligned for a',
'"clean modern look." The hero h1 sits 8px above the subhead with no',
'breathing room; the CTA button is the same visual weight as a',
'secondary "Learn more" link directly beside it.',
].join('\n');
/**
* Developer-experience floor seed. The onboarding flow takes eight manual
* steps (including emailing the team for an API key) before the first SDK
* call, and the plan closes by stating there is no quickstart command,
* hosted sandbox, or copy-pasteable curl example — the planted finding.
*
* The first line tells the agent to write its plan-mode plan to
* /tmp/gstack-test-plan-devex-floor.md. Lines are joined with '\n'; the
* text is prompt payload, so its exact wording is significant.
*/
export const FORCING_FLOOR_DEVEX = [
'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-devex-floor.md (use Edit/Write to that exact path).',
'',
'# Plan: SDK quickstart docs',
'',
'## Onboarding flow',
'Step 1: clone the repo.',
'Step 2: install bun manually if not present.',
'Step 3: copy .env.example to .env and fill in 8 environment variables.',
'Step 4: run database migrations against your local Postgres.',
'Step 5: start the dev server.',
'Step 6: open the docs in a separate tab.',
'Step 7: register an API key by emailing the team.',
'Step 8: paste the key into your .env, restart the server, then make',
'your first SDK call.',
'',
'No quickstart command, no hosted sandbox, no copy-pasteable curl example.',
].join('\n');
/**
* Multi-finding batching regression seed (periodic tier).
*
* Mirrors the May 2026 transcript bug shape: 4 distinct non-trivial findings
* spread across plan-eng-review's standard sections (Architecture, Code
* Quality, Tests, Performance). Each finding is independent — there is no
* legitimate reason to batch them into a single AskUserQuestion.
*
* Used by test/skill-e2e-plan-eng-multi-finding-batching.test.ts to assert
* the agent fires >= 3 review-phase AUQs (i.e., does NOT batch them into a
* "## Decisions to confirm" section + ExitPlanMode). Floor of 3 (not 4) is
* the [N-1] tolerance from the existing finding-count band convention.
*/
export const FORCING_BATCHING_ENG = [
// Preamble: directs the agent to a plan file path used only by this seed.
'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-eng-batching.md (use Edit/Write to that exact path).',
'',
'# Plan: Add background job retry framework',
'',
// Finding 1 (Architecture): custom scheduler instead of the job library's
// built-in retry hooks.
'## Architecture',
"We'll roll a custom exponential-backoff scheduler inline in each worker",
"rather than use the existing job library's built-in retry hooks. Same",
'shape as the library version, but we want full control over the curve.',
'',
// Finding 2 (Code quality): retry envelope copy-pasted across 5 worker files.
'## Code quality',
'The retry envelope (compute delay, log attempt, dispatch) is duplicated',
'across 5 worker files with copy-pasted bodies. We will leave the',
'duplication for now and refactor "later."',
'',
// Finding 3 (Tests): rewritten flow with no regression test for the prior
// at-most-once delivery guarantee.
'## Tests',
'The existing `processWebhookJob()` flow gets rewritten as part of this',
'change. No regression test for the prior at-most-once delivery guarantee',
'is planned.',
'',
// Finding 4 (Performance): payload re-fetched and dependency graph
// recomputed on every retry, with caching explicitly not planned.
'## Performance',
'On every retry we re-fetch the full job payload from the database, then',
'iterate the payload to recompute the dependency graph. Could cache the',
'graph on the first attempt; not planned.',
].join('\n');

2503
test/fixtures/golden-ship-claude.md vendored Normal file

File diff suppressed because it is too large Load Diff

3054
test/fixtures/golden/claude-ship-SKILL.md vendored Normal file

File diff suppressed because it is too large Load Diff

2669
test/fixtures/golden/codex-ship-SKILL.md vendored Normal file

File diff suppressed because it is too large Load Diff

3045
test/fixtures/golden/factory-ship-SKILL.md vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
# Weekend Project: Dependency Graph Visualizer
I want to build a tool that takes a codebase and visualizes its dependency graph — modules, imports, which files depend on which. For fun, for learning. Maybe open-source it.
## What I have so far
- Rough idea: point it at a repo, get an interactive graph
- Stack I'm leaning toward: TypeScript + D3 or Cytoscape for rendering
- Potential: could work for JS/TS first, maybe Python later
## What I don't know yet
- How to make the visualization actually useful vs just pretty
- Whether this should be a CLI, a web tool, or a VS Code extension
- What would make someone else want to use it

View File

@@ -0,0 +1,23 @@
# Plan: Team Velocity Dashboard
## Context
We're building a dashboard for engineering managers to track team code velocity — commits per engineer, PR cycle time, review latency, CI pass rate. The data already lives in GitHub; we're just aggregating it for a manager's single-pane view.
## Changes
1. New React component `TeamVelocityDashboard` in `src/dashboard/`
2. REST API endpoint `GET /api/team/velocity?days=30` returning aggregated metrics
3. Background job pulling GitHub data every 15 minutes into Postgres
4. Simple filter UI: team, date range, metric
## Architecture
- Frontend: React + shadcn/ui
- Backend: Express + PostgreSQL
- Data source: GitHub REST API (cached 15min)
## Open questions
- Should we support multiple repos per team?
- Do we show individual engineer names or aggregate only?

View File

@@ -0,0 +1,13 @@
# Our Idea: AI Tools for Product Managers
We're building AI tools for product managers at mid-market SaaS companies. The product combines a bunch of the things PMs already do — writing PRDs, gathering user feedback, analyzing usage data, drafting roadmaps — and uses LLMs to speed each of them up.
## Who we're targeting
Product managers at SaaS companies with 50-500 engineers. These PMs are stretched thin, juggle a lot of surface area, and would benefit from AI assistance.
## What we've done so far
- Talked to a few PMs we know from prior jobs
- Built a prototype that summarizes Zoom customer calls into a PRD stub
- Got on a waitlist of about 40 signups from LinkedIn posts

487
test/fixtures/overlay-nudges.ts vendored Normal file
View File

@@ -0,0 +1,487 @@
/**
* Overlay-efficacy fixture registry.
*
* Each fixture defines a reproducible A/B test for one behavioral nudge
* embedded in a model-overlays/*.md file. The harness at
* test/skill-e2e-overlay-harness.test.ts iterates this registry and runs
* `fixture.trials` A/B trials per fixture, asserting `fixture.pass(arms)`.
*
* Adding a new overlay eval = one entry in this list. The harness handles
* arm wiring, concurrency, artifact storage, rate-limit retries, and the
* cross-harness diagnostic.
*/
import * as fs from 'fs';
import * as path from 'path';
import {
firstTurnParallelism,
type AgentSdkResult,
} from '../helpers/agent-sdk-runner';
// Directory two levels above this file's directory. validateFixtures
// resolves each fixture's overlayPath against it.
const REPO_ROOT = path.resolve(__dirname, '..', '..');
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
/**
* Declarative description of one overlay A/B eval: which overlay file and
* model to exercise, how to prepare the workspace, what prompt to send, how
* to score a single trial, and how to judge the two arms against each other.
*
* `validateFixtures` enforces the id format and uniqueness, `trials >= 3`,
* `concurrency >= 1` when set, a non-empty `model` and `userPrompt`, a
* relative `overlayPath` that exists under the repo root, and that
* `setupWorkspace`, `metric`, and `pass` are functions. `allowedTools`,
* `maxTurns`, and `direction` are not validated there.
*/
export interface OverlayFixture {
/** Unique, lowercase/digits/dash only. Used in artifact paths. */
id: string;
/** Path to the overlay file, relative to repo root. */
overlayPath: string;
/** API model ID, not the overlay family name. */
model: string;
/** Integer >= 3. Trials per arm. */
trials: number;
/** Max concurrent queries for this fixture's arms. Default 3. */
concurrency?: number;
/** Populate the workspace dir before each trial. */
setupWorkspace: (dir: string) => void;
/** The prompt the model receives. Non-empty. */
userPrompt: string;
/** Per-fixture tool allowlist. Omit to use runner default [Read, Glob, Grep, Bash]. */
allowedTools?: string[];
/** Max turns per trial. Omit to use runner default (5). */
maxTurns?: number;
/**
* Direction of the expected effect. `higher_is_better` = overlay should
* increase the metric (e.g. fanout, files touched for literal scope).
* `lower_is_better` = overlay should decrease it (e.g. Bash count, turn count).
* Used only for cosmetic logging in the test output; `pass` is the actual gate.
*/
direction?: 'higher_is_better' | 'lower_is_better';
/** Compute the per-trial metric from the typed SDK result. */
metric: (r: AgentSdkResult) => number;
/** Acceptance predicate across all arms' per-trial metrics. */
pass: (arms: { overlay: number[]; off: number[] }) => boolean;
}
// ---------------------------------------------------------------------------
// Validation
// ---------------------------------------------------------------------------
/**
 * Check every fixture in the registry and throw on the first problem found.
 *
 * Per fixture, in order: id format, id uniqueness, `trials`, `concurrency`
 * (only when provided), `model`, `userPrompt`, `overlayPath` shape, overlay
 * file existence under the repo root, and finally that `setupWorkspace`,
 * `metric`, and `pass` are functions.
 *
 * @param fixtures Fixtures to validate.
 * @throws Error describing the first violated rule.
 */
export function validateFixtures(fixtures: OverlayFixture[]): void {
  const idPattern = /^[a-z0-9-]+$/;
  const requiredCallbacks = ['setupWorkspace', 'metric', 'pass'] as const;
  const seenIds = new Set<string>();

  for (const fixture of fixtures) {
    const { id, trials, concurrency } = fixture;

    const idIsWellFormed = Boolean(id) && idPattern.test(id);
    if (!idIsWellFormed) {
      throw new Error(
        `fixture id must be non-empty, lowercase/digits/dash only: ${JSON.stringify(id)}`,
      );
    }
    if (seenIds.has(id)) {
      throw new Error(`duplicate fixture id: ${id}`);
    }
    seenIds.add(id);

    const trialsOk = Number.isInteger(trials) && trials >= 3;
    if (!trialsOk) {
      throw new Error(`${id}: trials must be an integer >= 3 (got ${trials})`);
    }

    // concurrency is optional; it is only range-checked when supplied.
    if (concurrency !== undefined) {
      const concurrencyOk = Number.isInteger(concurrency) && concurrency >= 1;
      if (!concurrencyOk) {
        throw new Error(
          `${id}: concurrency must be an integer >= 1 (got ${concurrency})`,
        );
      }
    }

    if (!fixture.model) {
      throw new Error(`${id}: model must be non-empty`);
    }
    if (!fixture.userPrompt) {
      throw new Error(`${id}: userPrompt must be non-empty`);
    }

    // Reject absolute paths and anything containing '..' before touching the
    // filesystem, so the existence check stays inside the repo root.
    const { overlayPath } = fixture;
    if (path.isAbsolute(overlayPath) || overlayPath.includes('..')) {
      throw new Error(
        `${id}: overlayPath must be relative and must not contain '..' (got ${overlayPath})`,
      );
    }
    const overlayFile = path.resolve(REPO_ROOT, overlayPath);
    if (!fs.existsSync(overlayFile)) {
      throw new Error(`${id}: overlay file not found at ${overlayPath}`);
    }

    const nonFunction = requiredCallbacks.find(
      (key) => typeof fixture[key] !== 'function',
    );
    if (nonFunction !== undefined) {
      throw new Error(`${id}: ${nonFunction} must be a function`);
    }
  }
}
// ---------------------------------------------------------------------------
// Metric + predicate helpers
// ---------------------------------------------------------------------------
/**
 * Arithmetic mean of `xs`; an empty list yields 0 rather than NaN.
 */
function mean(xs: number[]): number {
  let total = 0;
  xs.forEach((x) => {
    total += x;
  });
  return xs.length === 0 ? 0 : total / xs.length;
}
/**
 * Fanout acceptance rule. Both conditions must hold:
 *
 *  - the overlay arm's mean metric exceeds the off arm's mean by at least
 *    0.5, and
 *  - at least 3 overlay trials individually scored 2 or more.
 *
 * The mean catches a small lift spread across every trial; the per-trial
 * floor requires that some trials actually fanned out, so a 0.5 lift where
 * no trial reaches 2 is rejected.
 */
export function fanoutPass(arms: { overlay: number[]; off: number[] }): boolean {
  const { overlay, off } = arms;
  const meanLift = mean(overlay) - mean(off);

  let trialsWithFanout = 0;
  overlay.forEach((score) => {
    if (score >= 2) trialsWithFanout += 1;
  });

  return trialsWithFanout >= 3 && meanLift >= 0.5;
}
/**
 * "Lower is better" acceptance rule: the overlay arm's mean must be at most
 * 80% of the off arm's mean. When the off arm's mean is exactly 0, the
 * overlay mean must be at most 0 as well.
 *
 * Used for nudges such as "effort-match" (fewer turns) and "dedicated tools
 * vs Bash" (fewer Bash calls).
 */
export function lowerIsBetter20Pct(arms: { overlay: number[]; off: number[] }): boolean {
  const baseline = mean(arms.off);
  const withOverlay = mean(arms.overlay);
  const ceiling = baseline === 0 ? baseline : baseline * 0.8;
  return withOverlay <= ceiling;
}
/**
 * "Higher is better" acceptance rule: the overlay arm's mean must be at
 * least 120% of the off arm's mean. When the off arm's mean is exactly 0,
 * any strictly positive overlay mean passes.
 *
 * Used for nudges such as "literal interpretation" (more files touched when
 * scope is ambiguous).
 */
export function higherIsBetter20Pct(arms: { overlay: number[]; off: number[] }): boolean {
  const baseline = mean(arms.off);
  const withOverlay = mean(arms.overlay);
  return baseline === 0 ? withOverlay > 0 : withOverlay >= baseline * 1.2;
}
// ---------------------------------------------------------------------------
// Metrics
// ---------------------------------------------------------------------------
/**
 * Number of recorded tool calls in the session whose tool is `Bash`.
 * Metric for the "dedicated tools over Bash" nudge in claude.md.
 */
export function bashToolCallCount(r: AgentSdkResult): number {
  return r.toolCalls.reduce(
    (bashCalls, call) => (call.tool === 'Bash' ? bashCalls + 1 : bashCalls),
    0,
  );
}
/**
 * Turn count reported by the SDK result (`turnsUsed`). Metric for the
 * "effort-match the step" nudge in opus-4-7.md, where a trivial prompt is
 * expected to finish in few turns.
 */
export function turnsToCompletion(r: AgentSdkResult): number {
  const { turnsUsed } = r;
  return turnsUsed;
}
/**
 * Number of distinct `file_path` values across the session's Edit, Write,
 * and MultiEdit tool calls. Calls with no input or an empty/missing
 * `file_path` are ignored.
 *
 * Metric for the "literal interpretation" nudge in opus-4-7.md: "fix the
 * tests" with several failures should touch every failing file.
 */
export function uniqueFilesEdited(r: AgentSdkResult): number {
  const editedPaths = new Set<string>();
  for (const { tool, input } of r.toolCalls) {
    switch (tool) {
      case 'Edit':
      case 'Write':
      case 'MultiEdit': {
        const filePath = (input as { file_path?: string } | null)?.file_path;
        if (filePath) editedPaths.add(filePath);
        break;
      }
      default:
        break;
    }
  }
  return editedPaths.size;
}
// ---------------------------------------------------------------------------
// Fixtures
// ---------------------------------------------------------------------------
/** Model ID used for the Sonnet re-run of every base fixture. */
const SONNET_MODEL = 'claude-sonnet-4-6';

/**
 * Base fixtures, each run against Opus 4.7. Every entry here is also re-run
 * on Sonnet (see SONNET_OVERLAY_FIXTURES), so a change made to a prompt,
 * workspace, or metric in this list applies to both models.
 */
const BASE_OVERLAY_FIXTURES: OverlayFixture[] = [
  {
    id: 'opus-4-7-fanout-toy',
    overlayPath: 'model-overlays/opus-4-7.md',
    model: 'claude-opus-4-7',
    trials: 10,
    concurrency: 3,
    setupWorkspace: (dir) => {
      fs.writeFileSync(path.join(dir, 'alpha.txt'), 'Alpha file: used in module A.\n');
      fs.writeFileSync(path.join(dir, 'beta.txt'), 'Beta file: used in module B.\n');
      fs.writeFileSync(path.join(dir, 'gamma.txt'), 'Gamma file: used in module C.\n');
    },
    userPrompt:
      'Read alpha.txt, beta.txt, and gamma.txt and summarize each in one line.',
    metric: (r) => firstTurnParallelism(r.assistantTurns[0]),
    pass: fanoutPass,
  },
  {
    id: 'opus-4-7-fanout-realistic',
    overlayPath: 'model-overlays/opus-4-7.md',
    model: 'claude-opus-4-7',
    trials: 10,
    concurrency: 3,
    setupWorkspace: (dir) => {
      fs.writeFileSync(
        path.join(dir, 'app.ts'),
        "import { config } from './config';\nimport { util } from './src/util';\n\nexport function main() { return config.name + ':' + util(); }\n",
      );
      fs.writeFileSync(
        path.join(dir, 'config.ts'),
        "export const config = { name: 'demo', version: 1 };\n",
      );
      fs.writeFileSync(
        path.join(dir, 'README.md'),
        '# demo project\n\nA small demo. Entry: `app.ts`. Config: `config.ts`.\n',
      );
      fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
      fs.writeFileSync(
        path.join(dir, 'src', 'util.ts'),
        "export function util() { return 'util-result'; }\n",
      );
    },
    userPrompt:
      'Audit this project: read app.ts, config.ts, and README.md, and glob for ' +
      'every .ts file under src/. Summarize what you find in 3 bullet points.',
    metric: (r) => firstTurnParallelism(r.assistantTurns[0]),
    pass: fanoutPass,
  },
  // -------------------------------------------------------------------------
  // claude.md / "Dedicated tools over Bash"
  // -------------------------------------------------------------------------
  {
    id: 'claude-dedicated-tools-vs-bash',
    overlayPath: 'model-overlays/claude.md',
    model: 'claude-opus-4-7',
    trials: 10,
    concurrency: 3,
    direction: 'lower_is_better',
    // 5 files + summary = needs more than default 5 turns. SDK throws
    // instead of returning a result when it hits the cap.
    maxTurns: 15,
    setupWorkspace: (dir) => {
      fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
      fs.writeFileSync(path.join(dir, 'src', 'index.ts'), "export const x = 1;\n");
      fs.writeFileSync(path.join(dir, 'src', 'util.ts'), "export function util() { return 42; }\n");
      fs.writeFileSync(path.join(dir, 'src', 'types.ts'), "export type Foo = { a: number };\n");
      fs.writeFileSync(path.join(dir, 'src', 'config.ts'), "export const c = { n: 'demo' };\n");
      fs.writeFileSync(path.join(dir, 'src', 'api.ts'), "export async function fetchFoo() { return null; }\n");
    },
    userPrompt:
      "List every TypeScript file under src/ and tell me what each exports. " +
      "You may use any tools available.",
    // Metric: total Bash tool_use count across the whole session.
    // The overlay says "prefer Read/Glob/Grep over cat/find/grep shell."
    // A model following that should emit Glob + Read, not Bash ls/find/cat.
    metric: bashToolCallCount,
    pass: lowerIsBetter20Pct,
  },
  // -------------------------------------------------------------------------
  // opus-4-7.md / "Effort-match the step"
  // -------------------------------------------------------------------------
  {
    id: 'opus-4-7-effort-match-trivial',
    overlayPath: 'model-overlays/opus-4-7.md',
    model: 'claude-opus-4-7',
    trials: 10,
    concurrency: 3,
    direction: 'lower_is_better',
    maxTurns: 8,
    setupWorkspace: (dir) => {
      fs.writeFileSync(
        path.join(dir, 'config.json'),
        '{"name": "demo", "version": "1.0.0"}\n',
      );
    },
    userPrompt: "What's the version in config.json?",
    // Metric: turns used to reach a result.
    // The overlay says "simple lookups don't need deep reasoning."
    // A trivial prompt like this should complete in 2 turns
    // (Read, respond). Extended deliberation bumps this up.
    metric: turnsToCompletion,
    pass: lowerIsBetter20Pct,
  },
  // -------------------------------------------------------------------------
  // opus-4-7.md / "Literal interpretation awareness"
  // -------------------------------------------------------------------------
  {
    id: 'opus-4-7-literal-interpretation',
    overlayPath: 'model-overlays/opus-4-7.md',
    model: 'claude-opus-4-7',
    trials: 10,
    concurrency: 3,
    direction: 'higher_is_better',
    // Allow Edit and Write so the model can fix the tests.
    allowedTools: ['Read', 'Glob', 'Grep', 'Bash', 'Edit', 'Write'],
    maxTurns: 15,
    setupWorkspace: (dir) => {
      // Three failing test files. The "obvious" one is named first; the
      // other two require the model to enumerate to notice them.
      fs.writeFileSync(
        path.join(dir, 'auth.test.ts'),
        "import { test, expect } from 'bun:test';\n" +
          "test('auth returns true', () => { expect(1).toBe(2); });\n",
      );
      fs.writeFileSync(
        path.join(dir, 'billing.test.ts'),
        "import { test, expect } from 'bun:test';\n" +
          "test('billing computes total', () => { expect('ten').toBe('10'); });\n",
      );
      fs.writeFileSync(
        path.join(dir, 'notifications.test.ts'),
        "import { test, expect } from 'bun:test';\n" +
          "test('notifications fire on save', () => { expect([1,2]).toEqual([1,2,3]); });\n",
      );
      fs.writeFileSync(
        path.join(dir, 'README.md'),
        "# demo\n\nSmall project with three failing test files.\n",
      );
    },
    userPrompt: "Fix the failing tests.",
    // Metric: number of unique test files the model edited/wrote.
    // The overlay says "fix ALL failing tests, not just the obvious one."
    // Overlay-ON should touch all 3 test files. Overlay-OFF might stop
    // at the first one after making it pass.
    metric: uniqueFilesEdited,
    pass: higherIsBetter20Pct,
  },
];

// =========================================================================
// Sonnet 4.6 variants of the Opus-4.7 fixtures.
//
// Rationale: /claude.md + /opus-4-7.md overlays measured as no-op or
// counterproductive on Opus 4.7. Before deleting the whole overlay stack,
// check whether weaker Claude models (Sonnet, Haiku) benefit from the same
// nudges. Same overlays, same prompts, same metrics, different model ID.
// Sonnet is ~4x cheaper than Opus so these 5 add ~$3 to a run.
//
// The variants are derived rather than hand-copied so they cannot drift from
// the base fixtures. Each derived id is the base id plus "-sonnet", e.g.
// 'opus-4-7-fanout-toy-sonnet' or 'claude-dedicated-tools-vs-bash-sonnet'.
// =========================================================================
const SONNET_OVERLAY_FIXTURES: OverlayFixture[] = BASE_OVERLAY_FIXTURES.map(
  (fixture) => ({
    ...fixture,
    id: `${fixture.id}-sonnet`,
    model: SONNET_MODEL,
  }),
);

/**
 * The full registry iterated by the overlay harness: the five base fixtures
 * followed by their five Sonnet variants, in the same relative order.
 */
export const OVERLAY_FIXTURES: OverlayFixture[] = [
  ...BASE_OVERLAY_FIXTURES,
  ...SONNET_OVERLAY_FIXTURES,
];

// Validate at module load so a broken fixture fails fast at test startup,
// not mid-run after burning API dollars.
validateFixtures(OVERLAY_FIXTURES);

22
test/fixtures/plans/ui-heavy-feature.md vendored Normal file
View File

@@ -0,0 +1,22 @@
# Plan: User Dashboard Page
## Context
We're shipping a new user dashboard at `/dashboard` showing recent activity,
notifications panel, and quick-action buttons. Users land here after login.
## UI Scope
- New React page component `UserDashboard.tsx` at `src/pages/`
- Three new sub-components: `ActivityFeed`, `NotificationsPanel`, `QuickActions`
- Tailwind CSS for layout, mobile-first responsive (breakpoints: sm/md/lg)
- Empty state, loading skeleton, error state for each panel
- Hover states + focus-visible outlines on every interactive element
- Modal dialog for "Mark all as read" on notifications panel
- Toast notification system for action feedback
## Backend
- New REST endpoint `GET /api/dashboard` returns `{ activity, notifications, quickActions }`
- Backed by existing PostgreSQL tables; no schema changes
## Out of scope
- Dark mode (separate plan)
- Personalization / customization (separate plan)

View File

@@ -0,0 +1,43 @@
{
"fixture": "qa-eval-checkout.html",
"bugs": [
{
"id": "broken-email-regex",
"category": "functional",
"severity": "high",
"description": "Email validation accepts 'user@' as valid — regex pattern [^@]+@[^@] is missing domain requirement",
"detection_hint": "email|regex|validation|accepts|invalid|user@|pattern"
},
{
"id": "nan-total",
"category": "functional",
"severity": "high",
"description": "Clearing the quantity field shows 'Total: $NaN' — parseInt on empty string returns NaN with no fallback",
"detection_hint": "NaN|total|quantity|empty|price|calculation|clear"
},
{
"id": "cc-field-overflow",
"category": "visual",
"severity": "medium",
"description": "Credit card input has no maxlength attribute — entering >20 characters causes text to overflow the container",
"detection_hint": "credit card|maxlength|overflow|cc|input|long|container"
},
{
"id": "missing-required-zip",
"category": "functional",
"severity": "medium",
"description": "Zip code field has no 'required' attribute — form can be submitted without a zip code",
"detection_hint": "zip|required|missing|form|submit|shipping|postal"
},
{
"id": "stripe-not-defined",
"category": "console",
"severity": "high",
"description": "Form submit triggers 'Uncaught ReferenceError: stripe is not defined' — payment SDK not loaded",
"detection_hint": "stripe|ReferenceError|not defined|console|error|submit|payment"
}
],
"total_bugs": 5,
"minimum_detection": 2,
"max_false_positives": 5
}

43
test/fixtures/qa-eval-ground-truth.json vendored Normal file
View File

@@ -0,0 +1,43 @@
{
"fixture": "qa-eval.html",
"bugs": [
{
"id": "broken-link",
"category": "functional",
"severity": "medium",
"description": "Navigation link 'Resources' points to /nonexistent-404-page which returns 404",
"detection_hint": "link|404|broken|dead|nonexistent|Resources"
},
{
"id": "disabled-submit",
"category": "functional",
"severity": "high",
"description": "Contact form submit button has 'disabled' attribute permanently — form can never be submitted",
"detection_hint": "disabled|submit|button|form|cannot submit|contact"
},
{
"id": "content-overflow",
"category": "visual",
"severity": "medium",
"description": "Statistics text is clipped by overflow:hidden container — content wider than 400px container",
"detection_hint": "overflow|clipped|truncated|hidden|text cut|statistics"
},
{
"id": "missing-alt",
"category": "accessibility",
"severity": "medium",
"description": "Logo image (<img src='/logo.png'>) has no alt attribute",
"detection_hint": "alt|accessibility|image|a11y|missing alt|logo"
},
{
"id": "console-error",
"category": "console",
"severity": "high",
"description": "TypeError on page load: Cannot read properties of undefined (reading 'map')",
"detection_hint": "console|error|TypeError|undefined|map"
}
],
"total_bugs": 5,
"minimum_detection": 2,
"max_false_positives": 5
}

View File

@@ -0,0 +1,43 @@
{
"fixture": "qa-eval-spa.html",
"bugs": [
{
"id": "broken-route",
"category": "functional",
"severity": "high",
"description": "Products nav link points to #/prodcts (typo) instead of #/products — shows 'Page not found'",
"detection_hint": "route|prodcts|typo|products|not found|broken link|navigation"
},
{
"id": "stale-cart-state",
"category": "functional",
"severity": "medium",
"description": "Cart count persists across route changes — never resets when navigating away from products",
"detection_hint": "cart|count|state|persist|reset|stale|navigation"
},
{
"id": "async-fetch-error",
"category": "functional",
"severity": "high",
"description": "Product list briefly loads then shows 'Error: Failed to fetch products from API' after 1 second",
"detection_hint": "error|fetch|products|API|loading|failed|async"
},
{
"id": "missing-aria-current",
"category": "accessibility",
"severity": "medium",
"description": "Navigation links have no aria-current attribute to indicate the active route",
"detection_hint": "aria|current|active|navigation|accessibility|a11y"
},
{
"id": "console-warn-leak",
"category": "console",
"severity": "medium",
"description": "console.warn fires on every route change: 'Possible memory leak detected: 11 event listeners'",
"detection_hint": "console|warn|memory leak|listener|event|warning"
}
],
"total_bugs": 5,
"minimum_detection": 2,
"max_false_positives": 5
}

View File

@@ -0,0 +1,5 @@
-- Migration: Drop the email and phone_number columns from users
-- WARNING: This migration is intentionally unsafe for testing
-- (test fixture: the destructive statements below are the behavior under test)
ALTER TABLE users DROP COLUMN email;
ALTER TABLE users DROP COLUMN phone_number;
-- No backfill, no reversibility check, no data preservation

12
test/fixtures/review-army-n-plus-one.rb vendored Normal file
View File

@@ -0,0 +1,12 @@
# N+1 query example — intentionally bad for testing
# Test fixture: the per-post association lookups below are the pattern under test.
class PostsController
# Loads every Post, then prints each post's author name and comment count.
def index
@posts = Post.all
@posts.each do |post|
# N+1: queries Author table for every post
puts post.author.name
# N+1: queries Comments table for every post
puts post.comments.count
end
end
end

View File

@@ -0,0 +1,86 @@
/* Planted design anti-patterns for E2E eval — 7 issues */
/* Issues 1-5 and 7 are marked inline below; no rule in this stylesheet is marked as Issue 6. */
/* NOTE(review): Issue 6 appears to be marked in the HTML fixture rather than here — confirm. */
/* Issue 1: [HIGH] Blacklisted font (Papyrus) */
/* Issue 2: [HIGH] Body text < 16px (14px) */
body {
font-family: 'Papyrus', sans-serif;
font-size: 14px;
margin: 0;
padding: 0;
}
/* Issue 5: [MEDIUM] Purple/violet gradient background */
.hero {
background: linear-gradient(135deg, #6366f1, #8b5cf6);
text-align: center;
padding: 80px 20px;
color: white;
}
/* Hero headline */
.hero h1 {
text-align: center;
font-size: 48px;
}
/* Hero tagline */
.hero p {
text-align: center;
font-size: 20px;
}
/* Issue 7: [LOW] 3-column feature grid with icon circles */
.features {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 24px;
padding: 60px 40px;
text-align: center;
}
/* Individual card: rounded corners, centered text, light grey background */
.feature-card {
border-radius: 24px;
padding: 32px;
text-align: center;
background: #f9fafb;
}
/* Icon in colored circle — AI slop pattern */
.icon-circle {
width: 60px;
height: 60px;
border-radius: 50%;
background: #ede9fe;
display: flex;
align-items: center;
justify-content: center;
margin: 0 auto 16px;
font-size: 24px;
}
/* Issue 3: [HIGH] outline: none without replacement */
button {
outline: none;
background: #6366f1;
color: white;
border: none;
padding: 12px 24px;
border-radius: 24px;
cursor: pointer;
}
/* Small-print link style: 11px text with 4px/8px padding */
.small-link {
font-size: 11px;
padding: 4px 8px;
}
/* Issue 4: [HIGH] !important usage */
.override {
color: red !important;
margin-left: 10px !important;
}
/* Page footer: centered white text on a dark background */
.footer {
text-align: center;
padding: 40px;
background: #1e1b4b;
color: white;
}

View File

@@ -0,0 +1,41 @@
<!DOCTYPE html>
<html lang="en">
<head>
<!-- Test fixture page: planted issues in the markup are flagged with "Issue N" comments below -->
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="stylesheet" href="styles.css">
<title>Our Platform</title>
</head>
<body>
<!-- Issue 6: [MEDIUM] Generic hero copy ("Welcome to...", "all-in-one solution") -->
<div class="hero">
<h1>Welcome to Our Platform</h1>
<p>Your all-in-one solution for everything you need</p>
<button>Get Started</button>
</div>
<!-- Issue 7: [LOW] 3-column feature grid with icon-in-circle + title + description -->
<div class="features">
<div class="feature-card">
<div class="icon-circle">&#9733;</div>
<h3>Feature One</h3>
<p>A short description of this amazing feature that will change your life.</p>
</div>
<div class="feature-card">
<div class="icon-circle">&#9889;</div>
<h3>Feature Two</h3>
<p>Another incredible capability that sets us apart from the competition.</p>
</div>
<div class="feature-card">
<div class="icon-circle">&#9881;</div>
<h3>Feature Three</h3>
<p>Yet another powerful tool to streamline your workflow effortlessly.</p>
</div>
</div>
<!-- Footer: the tagline uses the .override class and the link uses the .small-link class -->
<div class="footer">
<p class="override">Unlock the power of our platform today</p>
<a href="#" class="small-link">Terms of Service</a>
</div>
</body>
</html>

30
test/fixtures/review-eval-enum-diff.rb vendored Normal file
View File

@@ -0,0 +1,30 @@
# Feature branch version: adds "returned" status but misses consumers
class Order < ApplicationRecord
# Allowed status values; the inclusion validation below rejects anything else.
STATUSES = %w[pending processing shipped delivered returned].freeze
validates :status, inclusion: { in: STATUSES }
# Maps the current status to a customer-facing label.
# The case has no else branch, so an unmatched status yields nil.
def display_status
case status
when 'pending' then 'Awaiting processing'
when 'processing' then 'Being prepared'
when 'shipped' then 'On the way'
when 'delivered' then 'Delivered'
# BUG: 'returned' not handled — falls through to nil
end
end
# True only while the status is 'pending' or 'processing'.
def can_cancel?
# BUG: should 'returned' be cancellable? Not considered.
%w[pending processing].include?(status)
end
# Sends the status email through OrderMailer with deliver_later.
# Only 'pending', 'shipped' and 'delivered' have a branch; any other status sends nothing.
def notify_customer
case status
when 'pending' then OrderMailer.confirmation(self).deliver_later
when 'shipped' then OrderMailer.shipped(self).deliver_later
when 'delivered' then OrderMailer.delivered(self).deliver_later
# BUG: 'returned' has no notification — customer won't know return was received
end
end
end

27
test/fixtures/review-eval-enum.rb vendored Normal file
View File

@@ -0,0 +1,27 @@
# Existing file on main: order model with status handling
class Order < ApplicationRecord
# Allowed status values; the inclusion validation below rejects anything else.
STATUSES = %w[pending processing shipped delivered].freeze
validates :status, inclusion: { in: STATUSES }
# Maps the current status to a customer-facing label.
# The case has no else branch, so an unmatched status yields nil.
def display_status
case status
when 'pending' then 'Awaiting processing'
when 'processing' then 'Being prepared'
when 'shipped' then 'On the way'
when 'delivered' then 'Delivered'
end
end
# True only while the status is 'pending' or 'processing'.
def can_cancel?
%w[pending processing].include?(status)
end
# Sends the status email through OrderMailer with deliver_later.
# Only 'pending', 'shipped' and 'delivered' have a branch; 'processing' sends nothing.
def notify_customer
case status
when 'pending' then OrderMailer.confirmation(self).deliver_later
when 'shipped' then OrderMailer.shipped(self).deliver_later
when 'delivered' then OrderMailer.delivered(self).deliver_later
end
end
end

14
test/fixtures/review-eval-vuln.rb vendored Normal file
View File

@@ -0,0 +1,14 @@
# NOTE(review): appears to be a review-eval fixture; the unsafe patterns flagged
# inline look intentional — confirm before changing them.
class UserController < ApplicationController
# Renders the first User matching params[:id] as JSON.
def show
# SQL injection — interpolating user input directly into query
@user = User.where("id = #{params[:id]}").first
render json: @user
end
# Sets the role of the user identified by params[:id] to 'admin',
# then responds with an empty 200 OK.
def promote
# Bypasses ActiveRecord validations — update_column skips callbacks + validation
@user = User.find(params[:id])
@user.update_column(:role, 'admin')
head :ok
end
end