Initial import from garrytan/gstack@026751e (main snapshot via local relay)
Some checks failed
Workflow Lint / actionlint (push) Has been cancelled
Build CI Image / build (push) Has been cancelled
Skill Docs Freshness / check-freshness (push) Has been cancelled
Periodic Evals / build-image (push) Has been cancelled
Periodic Evals / evals (map[file:test/codex-e2e.test.ts name:e2e-codex]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/gemini-e2e.test.ts name:e2e-gemini]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-design.test.ts name:e2e-design]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-plan.test.ts name:e2e-plan]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-bugs.test.ts name:e2e-qa-bugs]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-workflow.test.ts name:e2e-qa-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-review.test.ts name:e2e-review]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-workflow.test.ts name:e2e-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-routing-e2e.test.ts name:e2e-routing]) (push) Has been cancelled
Some checks failed
Workflow Lint / actionlint (push) Has been cancelled
Build CI Image / build (push) Has been cancelled
Skill Docs Freshness / check-freshness (push) Has been cancelled
Periodic Evals / build-image (push) Has been cancelled
Periodic Evals / evals (map[file:test/codex-e2e.test.ts name:e2e-codex]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/gemini-e2e.test.ts name:e2e-gemini]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-design.test.ts name:e2e-design]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-plan.test.ts name:e2e-plan]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-bugs.test.ts name:e2e-qa-bugs]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-workflow.test.ts name:e2e-qa-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-review.test.ts name:e2e-review]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-workflow.test.ts name:e2e-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-routing-e2e.test.ts name:e2e-routing]) (push) Has been cancelled
Source: https://github.com/garrytan/gstack/commit/026751e
This commit is contained in:
76
test/fixtures/coverage-audit-fixture.ts
vendored
Normal file
76
test/fixtures/coverage-audit-fixture.ts
vendored
Normal file
@@ -0,0 +1,76 @@
|
||||
/**
 * Shared fixture for test coverage audit E2E tests.
 *
 * Creates a Node.js project with billing source code that has intentional
 * test coverage gaps: processPayment has happy-path-only tests,
 * refundPayment has no tests at all.
 *
 * Used by: ship-coverage-audit E2E, review-coverage-audit E2E
 */
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { spawnSync } from 'child_process';
|
||||
|
||||
/**
 * Populate `dir` with a small Node.js + vitest project whose test suite
 * deliberately under-covers `src/billing.ts`, commit everything on `main`,
 * and leave the repository checked out on a new `feature/billing` branch.
 *
 * @param dir Existing directory to write the project into. It is also the
 *   working directory for every git command.
 * @throws Error if a git command cannot be spawned, times out, or exits with
 *   a non-zero status. Failing here keeps a half-initialised repository from
 *   surfacing later as an unrelated-looking test failure.
 */
export function createCoverageAuditFixture(dir: string): void {
  // Create a Node.js project WITH test framework but coverage gaps
  fs.writeFileSync(path.join(dir, 'package.json'), JSON.stringify({
    name: 'test-coverage-app',
    version: '1.0.0',
    type: 'module',
    scripts: { test: 'echo "no tests yet"' },
    devDependencies: { vitest: '^1.0.0' },
  }, null, 2));

  // Create vitest config
  fs.writeFileSync(path.join(dir, 'vitest.config.ts'),
    `import { defineConfig } from 'vitest/config';\nexport default defineConfig({ test: {} });\n`);

  fs.writeFileSync(path.join(dir, 'VERSION'), '0.1.0.0\n');
  fs.writeFileSync(path.join(dir, 'CHANGELOG.md'), '# Changelog\n');

  // Create source file with multiple code paths
  fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
  fs.writeFileSync(path.join(dir, 'src', 'billing.ts'), `
export function processPayment(amount: number, currency: string) {
  if (amount <= 0) throw new Error('Invalid amount');
  if (currency !== 'USD' && currency !== 'EUR') throw new Error('Unsupported currency');
  return { status: 'success', amount, currency };
}

export function refundPayment(paymentId: string, reason: string) {
  if (!paymentId) throw new Error('Payment ID required');
  if (!reason) throw new Error('Reason required');
  return { status: 'refunded', paymentId, reason };
}
`);

  // Create a test directory with ONE test (partial coverage)
  fs.mkdirSync(path.join(dir, 'test'), { recursive: true });
  fs.writeFileSync(path.join(dir, 'test', 'billing.test.ts'), `
import { describe, test, expect } from 'vitest';
import { processPayment } from '../src/billing';

describe('processPayment', () => {
  test('processes valid payment', () => {
    const result = processPayment(100, 'USD');
    expect(result.status).toBe('success');
  });
  // GAP: no test for invalid amount
  // GAP: no test for unsupported currency
  // GAP: refundPayment not tested at all
});
`);

  // Run a command inside the fixture directory and fail loudly if it does
  // not succeed. spawnSync reports spawn failures and timeouts via `error`,
  // and a signal kill via a null `status`; both are treated as failure.
  const run = (cmd: string, args: string[]): void => {
    const result = spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
    if (result.error !== undefined || result.status !== 0) {
      const detail = result.error
        ? result.error.message
        : String(result.stderr ?? '').trim();
      throw new Error(
        `${cmd} ${args.join(' ')} failed (exit status ${result.status}): ${detail}`,
      );
    }
  };

  // Init git repo with main branch
  run('git', ['init', '-b', 'main']);
  run('git', ['config', 'user.email', 'test@test.com']);
  run('git', ['config', 'user.name', 'Test']);
  run('git', ['add', '.']);
  run('git', ['commit', '-m', 'initial commit']);

  // Create feature branch
  run('git', ['checkout', '-b', 'feature/billing']);
}
|
||||
7
test/fixtures/eval-baselines.json
vendored
Normal file
7
test/fixtures/eval-baselines.json
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
{
  "command_reference": { "clarity": 4, "completeness": 3, "actionability": 4 },
  "snapshot_flags": { "clarity": 4, "completeness": 4, "actionability": 4 },
  "browse_skill": { "clarity": 4, "completeness": 4, "actionability": 4 },
  "qa_workflow": { "clarity": 4, "completeness": 4, "actionability": 4 },
  "qa_health_rubric": { "clarity": 4, "completeness": 3, "actionability": 4 }
}
|
||||
122
test/fixtures/forcing-finding-seeds.ts
vendored
Normal file
122
test/fixtures/forcing-finding-seeds.ts
vendored
Normal file
@@ -0,0 +1,122 @@
|
||||
/**
 * Per-skill draft-plan seeds engineered to surface at least one
 * review-phase finding in the corresponding plan-* review skill.
 *
 * Used by gate-tier finding-floor tests
 * (test/skill-e2e-plan-{eng,ceo,design,devex}-finding-floor.test.ts) as
 * the minimum-cost regression for the May 2026 transcript bug:
 *   "/plan-eng-review reviewed a real PR diff, wrote a multi-section
 *    review plan to ~/.claude/plans/ and called ExitPlanMode without
 *    ever firing AskUserQuestion."
 *
 * Each seed is small and pre-loaded with one obvious finding the
 * matching skill cannot honestly miss. Floor tests assert
 * `reviewCount >= 1` — i.e., the model fired at least one review-phase
 * AUQ before reaching plan_ready / completion_summary / ceiling.
 *
 * Each seed includes the standard "write your plan-mode plan to /tmp/…"
 * preamble that the existing periodic finding-count fixtures use, so
 * the agent has a concrete plan-file target. The /tmp path is unique
 * per skill to avoid collisions if floor tests run in parallel.
 *
 * For a deeper [N-1, N+2] count band assertion, see the periodic
 * test/skill-e2e-plan-{X}-finding-count.test.ts fixtures.
 */
|
||||
|
||||
/**
 * Eng-review floor seed: the plan hand-rolls a UUIDv7 generator in every
 * service instead of using `crypto.randomUUID()`, and admits it has no
 * concrete reason for doing so.
 */
export const FORCING_FLOOR_ENG = [
  'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-eng-floor.md (use Edit/Write to that exact path).',
  '',
  '# Plan: Add request-id propagation across services',
  '',
  '## Architecture',
  "We'll roll a custom UUIDv7 generator inline in each service rather than",
  "use Node's crypto.randomUUID() built-in. Same shape, but we want full",
  'control over the entropy source for "future flexibility" — no concrete',
  'reason yet.',
].join('\n');
|
||||
|
||||
/**
 * CEO-review floor seed: a pricing-tier plan with a vague goal, a vague
 * success metric, and a premise that was never checked with any developer.
 */
export const FORCING_FLOOR_CEO = [
  'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-ceo-floor.md (use Edit/Write to that exact path).',
  '',
  '# Plan: Launch a "developer-friendly" pricing tier',
  '',
  '## Goal',
  'Increase developer adoption.',
  '',
  '## Success metric',
  'More signups.',
  '',
  '## Premise',
  "We haven't talked to any developers about whether the current pricing",
  'is actually a barrier. The team agreed it "feels like" it should be cheaper.',
].join('\n');
|
||||
|
||||
/**
 * Design-review floor seed: a landing-page layout with everything
 * center-aligned, a cramped hero heading, and a CTA that carries the same
 * visual weight as the secondary link next to it.
 */
export const FORCING_FLOOR_DESIGN = [
  'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-design-floor.md (use Edit/Write to that exact path).',
  '',
  '# Plan: Marketing landing page',
  '',
  '## Layout',
  'All headings, taglines, and body copy will be center-aligned for a',
  '"clean modern look." The hero h1 sits 8px above the subhead with no',
  'breathing room; the CTA button is the same visual weight as a',
  'secondary "Learn more" link directly beside it.',
].join('\n');
|
||||
|
||||
/**
 * DevEx-review floor seed: an eight-step manual onboarding flow that
 * explicitly has no quickstart command, hosted sandbox, or copy-pasteable
 * example.
 */
export const FORCING_FLOOR_DEVEX = [
  'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-devex-floor.md (use Edit/Write to that exact path).',
  '',
  '# Plan: SDK quickstart docs',
  '',
  '## Onboarding flow',
  'Step 1: clone the repo.',
  'Step 2: install bun manually if not present.',
  'Step 3: copy .env.example to .env and fill in 8 environment variables.',
  'Step 4: run database migrations against your local Postgres.',
  'Step 5: start the dev server.',
  'Step 6: open the docs in a separate tab.',
  'Step 7: register an API key by emailing the team.',
  'Step 8: paste the key into your .env, restart the server, then make',
  'your first SDK call.',
  '',
  'No quickstart command, no hosted sandbox, no copy-pasteable curl example.',
].join('\n');
|
||||
|
||||
/**
 * Multi-finding batching regression seed (periodic tier).
 *
 * Mirrors the May 2026 transcript bug shape: 4 distinct non-trivial findings
 * spread across plan-eng-review's standard sections (Architecture, Code
 * Quality, Tests, Performance). Each finding is independent — there is no
 * legitimate reason to batch them into a single AskUserQuestion.
 *
 * Used by test/skill-e2e-plan-eng-multi-finding-batching.test.ts to assert
 * the agent fires >= 3 review-phase AUQs (i.e., does NOT batch them into a
 * "## Decisions to confirm" section + ExitPlanMode). Floor of 3 (not 4) is
 * the [N-1] tolerance from the existing finding-count band convention.
 */
export const FORCING_BATCHING_ENG = [
  'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-eng-batching.md (use Edit/Write to that exact path).',
  '',
  '# Plan: Add background job retry framework',
  '',
  '## Architecture',
  "We'll roll a custom exponential-backoff scheduler inline in each worker",
  "rather than use the existing job library's built-in retry hooks. Same",
  'shape as the library version, but we want full control over the curve.',
  '',
  '## Code quality',
  'The retry envelope (compute delay, log attempt, dispatch) is duplicated',
  'across 5 worker files with copy-pasted bodies. We will leave the',
  'duplication for now and refactor "later."',
  '',
  '## Tests',
  'The existing `processWebhookJob()` flow gets rewritten as part of this',
  'change. No regression test for the prior at-most-once delivery guarantee',
  'is planned.',
  '',
  '## Performance',
  'On every retry we re-fetch the full job payload from the database, then',
  'iterate the payload to recompute the dependency graph. Could cache the',
  'graph on the first attempt; not planned.',
].join('\n');
|
||||
2503
test/fixtures/golden-ship-claude.md
vendored
Normal file
2503
test/fixtures/golden-ship-claude.md
vendored
Normal file
File diff suppressed because it is too large
Load Diff
3054
test/fixtures/golden/claude-ship-SKILL.md
vendored
Normal file
3054
test/fixtures/golden/claude-ship-SKILL.md
vendored
Normal file
File diff suppressed because it is too large
Load Diff
2669
test/fixtures/golden/codex-ship-SKILL.md
vendored
Normal file
2669
test/fixtures/golden/codex-ship-SKILL.md
vendored
Normal file
File diff suppressed because it is too large
Load Diff
3045
test/fixtures/golden/factory-ship-SKILL.md
vendored
Normal file
3045
test/fixtures/golden/factory-ship-SKILL.md
vendored
Normal file
File diff suppressed because it is too large
Load Diff
15
test/fixtures/mode-posture/builder-idea.md
vendored
Normal file
15
test/fixtures/mode-posture/builder-idea.md
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
# Weekend Project: Dependency Graph Visualizer

I want to build a tool that takes a codebase and visualizes its dependency graph — modules, imports, which files depend on which. For fun, for learning. Maybe open-source it.

## What I have so far

- Rough idea: point it at a repo, get an interactive graph
- Stack I'm leaning toward: TypeScript + D3 or Cytoscape for rendering
- Potential: could work for JS/TS first, maybe Python later

## What I don't know yet

- How to make the visualization actually useful vs just pretty
- Whether this should be a CLI, a web tool, or a VS Code extension
- What would make someone else want to use it
|
||||
23
test/fixtures/mode-posture/expansion-plan.md
vendored
Normal file
23
test/fixtures/mode-posture/expansion-plan.md
vendored
Normal file
@@ -0,0 +1,23 @@
|
||||
# Plan: Team Velocity Dashboard

## Context

We're building a dashboard for engineering managers to track team code velocity — commits per engineer, PR cycle time, review latency, CI pass rate. The data already lives in GitHub; we're just aggregating it for a manager's single-pane view.

## Changes

1. New React component `TeamVelocityDashboard` in `src/dashboard/`
2. REST API endpoint `GET /api/team/velocity?days=30` returning aggregated metrics
3. Background job pulling GitHub data every 15 minutes into Postgres
4. Simple filter UI: team, date range, metric

## Architecture

- Frontend: React + shadcn/ui
- Backend: Express + PostgreSQL
- Data source: GitHub REST API (cached 15min)

## Open questions

- Should we support multiple repos per team?
- Do we show individual engineer names or aggregate only?
|
||||
13
test/fixtures/mode-posture/forcing-pitch.md
vendored
Normal file
13
test/fixtures/mode-posture/forcing-pitch.md
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
# Our Idea: AI Tools for Product Managers

We're building AI tools for product managers at mid-market SaaS companies. The product combines a bunch of the things PMs already do — writing PRDs, gathering user feedback, analyzing usage data, drafting roadmaps — and uses LLMs to speed each of them up.

## Who we're targeting

Product managers at SaaS companies with 50-500 engineers. These PMs are stretched thin, juggle a lot of surface area, and would benefit from AI assistance.

## What we've done so far

- Talked to a few PMs we know from prior jobs
- Built a prototype that summarizes Zoom customer calls into a PRD stub
- Got on a waitlist of about 40 signups from LinkedIn posts
|
||||
487
test/fixtures/overlay-nudges.ts
vendored
Normal file
487
test/fixtures/overlay-nudges.ts
vendored
Normal file
@@ -0,0 +1,487 @@
|
||||
/**
 * Overlay-efficacy fixture registry.
 *
 * Each fixture defines a reproducible A/B test for one behavioral nudge
 * embedded in a model-overlays/*.md file. The harness at
 * test/skill-e2e-overlay-harness.test.ts iterates this registry and runs
 * `fixture.trials` A/B trials per fixture, asserting `fixture.pass(arms)`.
 *
 * Adding a new overlay eval = one entry in this list. The harness handles
 * arm wiring, concurrency, artifact storage, rate-limit retries, and the
 * cross-harness diagnostic.
 */
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import {
|
||||
firstTurnParallelism,
|
||||
type AgentSdkResult,
|
||||
} from '../helpers/agent-sdk-runner';
|
||||
|
||||
// Absolute path two directory levels above this file's directory; fixture
// `overlayPath` values are resolved against it during validation.
const REPO_ROOT = path.resolve(__dirname, '..', '..');
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Declarative description of one overlay A/B evaluation: which overlay file
 * and model to use, how to prepare the workspace, what to ask, how to score
 * a single trial, and how to decide pass/fail across both arms.
 */
export interface OverlayFixture {
  /** Unique, lowercase/digits/dash only. Used in artifact paths. */
  id: string;
  /** Path to the overlay file, relative to repo root. */
  overlayPath: string;
  /** API model ID, not the overlay family name. */
  model: string;
  /** Integer >= 3. Trials per arm. */
  trials: number;
  /** Max concurrent queries for this fixture's arms. Default 3. */
  concurrency?: number;
  /** Populate the workspace dir before each trial. */
  setupWorkspace: (dir: string) => void;
  /** The prompt the model receives. Non-empty. */
  userPrompt: string;
  /** Per-fixture tool allowlist. Omit to use runner default [Read, Glob, Grep, Bash]. */
  allowedTools?: string[];
  /** Max turns per trial. Omit to use runner default (5). */
  maxTurns?: number;
  /**
   * Direction of the expected effect. `higher_is_better` = overlay should
   * increase the metric (e.g. fanout, files touched for literal scope).
   * `lower_is_better` = overlay should decrease it (e.g. Bash count, turn count).
   * Used only for cosmetic logging in the test output; `pass` is the actual gate.
   */
  direction?: 'higher_is_better' | 'lower_is_better';
  /** Compute the per-trial metric from the typed SDK result. */
  metric: (r: AgentSdkResult) => number;
  /** Acceptance predicate across all arms' per-trial metrics. */
  pass: (arms: { overlay: number[]; off: number[] }) => boolean;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Validation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Validate every fixture in the registry, throwing on the first problem.
 *
 * Checks, per fixture and in this order: id format and uniqueness, `trials`,
 * optional `concurrency`, optional `maxTurns`, non-empty `model` and
 * `userPrompt`, that `overlayPath` is relative with no `..` and exists under
 * the repo root, and that `setupWorkspace`, `metric` and `pass` are functions.
 *
 * @param fixtures Fixtures to check. An empty list is valid.
 * @throws Error describing the first invalid field, prefixed with the
 *   fixture id where one is available.
 */
export function validateFixtures(fixtures: OverlayFixture[]): void {
  const ids = new Set<string>();
  for (const f of fixtures) {
    if (!f.id || !/^[a-z0-9-]+$/.test(f.id)) {
      throw new Error(
        `fixture id must be non-empty, lowercase/digits/dash only: ${JSON.stringify(f.id)}`,
      );
    }
    if (ids.has(f.id)) {
      throw new Error(`duplicate fixture id: ${f.id}`);
    }
    ids.add(f.id);

    if (!Number.isInteger(f.trials) || f.trials < 3) {
      throw new Error(`${f.id}: trials must be an integer >= 3 (got ${f.trials})`);
    }
    if (
      f.concurrency !== undefined &&
      (!Number.isInteger(f.concurrency) || f.concurrency < 1)
    ) {
      throw new Error(
        `${f.id}: concurrency must be an integer >= 1 (got ${f.concurrency})`,
      );
    }
    // Same rule as concurrency: an optional count must be a positive integer.
    if (
      f.maxTurns !== undefined &&
      (!Number.isInteger(f.maxTurns) || f.maxTurns < 1)
    ) {
      throw new Error(
        `${f.id}: maxTurns must be an integer >= 1 (got ${f.maxTurns})`,
      );
    }

    if (!f.model) throw new Error(`${f.id}: model must be non-empty`);
    if (!f.userPrompt) throw new Error(`${f.id}: userPrompt must be non-empty`);

    // Reject anything that could resolve outside the repo root before
    // touching the filesystem.
    if (path.isAbsolute(f.overlayPath) || f.overlayPath.includes('..')) {
      throw new Error(
        `${f.id}: overlayPath must be relative and must not contain '..' (got ${f.overlayPath})`,
      );
    }
    const fullPath = path.resolve(REPO_ROOT, f.overlayPath);
    if (!fs.existsSync(fullPath)) {
      throw new Error(`${f.id}: overlay file not found at ${f.overlayPath}`);
    }

    // Runtime check: fixtures may arrive from untyped callers.
    for (const fn of ['setupWorkspace', 'metric', 'pass'] as const) {
      if (typeof f[fn] !== 'function') {
        throw new Error(`${f.id}: ${fn} must be a function`);
      }
    }
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Metric + predicate helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Arithmetic mean of `xs`. Returns 0 for an empty array so callers never
 * see NaN; predicates that need real data must check for emptiness
 * themselves.
 */
function mean(xs: number[]): number {
  if (xs.length === 0) return 0;
  return xs.reduce((a, b) => a + b, 0) / xs.length;
}

/**
 * Standard fanout predicate: overlay mean beats off mean by at least 0.5
 * parallel tool_use blocks in first turn, AND at least 3 of the overlay
 * trials emit >= 2 parallel tool_use blocks.
 *
 * The combined rule catches both "overlay nudges every trial slightly"
 * (mean) and "overlay sometimes triggers real fanout" (floor). A single
 * 0.5 lift with every trial still emitting 1 call would be suspicious;
 * this predicate rejects it.
 *
 * Returns false when either arm has no trials: without a baseline the lift
 * would be measured against the 0 that `mean` yields for an empty array.
 */
export function fanoutPass(arms: { overlay: number[]; off: number[] }): boolean {
  if (arms.overlay.length === 0 || arms.off.length === 0) return false;
  const lift = mean(arms.overlay) - mean(arms.off);
  const floorHits = arms.overlay.filter((n) => n >= 2).length;
  return lift >= 0.5 && floorHits >= 3;
}

/**
 * Generic "lower is better" pass predicate: overlay mean should drop the
 * metric by at least 20% vs baseline. Used for nudges like "effort-match"
 * (fewer turns) and "dedicated tools vs Bash" (fewer Bash calls).
 *
 * When the baseline mean is already 0 the overlay passes only if its mean is
 * also <= 0. Returns false when either arm has no trials, because an empty
 * overlay arm has mean 0 and would otherwise pass with no data.
 */
export function lowerIsBetter20Pct(arms: { overlay: number[]; off: number[] }): boolean {
  if (arms.overlay.length === 0 || arms.off.length === 0) return false;
  const meanOff = mean(arms.off);
  if (meanOff === 0) return mean(arms.overlay) <= meanOff;
  return mean(arms.overlay) <= meanOff * 0.8;
}

/**
 * Generic "higher is better" pass predicate: overlay mean should lift the
 * metric by at least 20% vs baseline. Used for nudges like "literal
 * interpretation" (more files touched when scope is ambiguous).
 *
 * When the baseline mean is 0 any positive overlay mean passes. Returns
 * false when either arm has no trials, so a missing baseline cannot be
 * mistaken for a zero one.
 */
export function higherIsBetter20Pct(arms: { overlay: number[]; off: number[] }): boolean {
  if (arms.overlay.length === 0 || arms.off.length === 0) return false;
  const meanOff = mean(arms.off);
  const meanOn = mean(arms.overlay);
  if (meanOff === 0) return meanOn > 0;
  return meanOn >= meanOff * 1.2;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Metrics
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Count the total number of Bash tool_use blocks across ALL assistant turns.
 * Signal for "dedicated tools over Bash" nudge in claude.md.
 *
 * @param r Result of one trial; only `toolCalls` is read.
 * @returns Number of entries in `r.toolCalls` whose `tool` is exactly 'Bash'.
 */
export function bashToolCallCount(r: AgentSdkResult): number {
  return r.toolCalls.filter((c) => c.tool === 'Bash').length;
}
|
||||
|
||||
/**
 * Total turns the session used to complete. Signal for "effort-match the
 * step" nudge in opus-4-7.md — trivial prompts should complete quickly.
 *
 * @param r Result of one trial; only `turnsUsed` is read.
 * @returns `r.turnsUsed`, unchanged.
 */
export function turnsToCompletion(r: AgentSdkResult): number {
  return r.turnsUsed;
}
|
||||
|
||||
/**
 * Count of unique files the model edited or wrote. Signal for "literal
 * interpretation" nudge in opus-4-7.md — "fix the tests" with multiple
 * failures should touch all of them.
 *
 * Only Edit, Write and MultiEdit calls are considered, and only when their
 * input carries a non-empty `file_path`. Paths are compared as the exact
 * strings supplied, with no normalisation.
 *
 * @param r Result of one trial; only `toolCalls` is read.
 * @returns Number of distinct `file_path` strings seen.
 */
export function uniqueFilesEdited(r: AgentSdkResult): number {
  const touched = new Set<string>();
  for (const call of r.toolCalls) {
    if (call.tool === 'Edit' || call.tool === 'Write' || call.tool === 'MultiEdit') {
      // Tool input is untyped at this layer; read file_path defensively.
      const input = call.input as { file_path?: string } | null;
      if (input?.file_path) touched.add(input.file_path);
    }
  }
  return touched.size;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Fixtures
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export const OVERLAY_FIXTURES: OverlayFixture[] = [
|
||||
{
|
||||
id: 'opus-4-7-fanout-toy',
|
||||
overlayPath: 'model-overlays/opus-4-7.md',
|
||||
model: 'claude-opus-4-7',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
setupWorkspace: (dir) => {
|
||||
fs.writeFileSync(path.join(dir, 'alpha.txt'), 'Alpha file: used in module A.\n');
|
||||
fs.writeFileSync(path.join(dir, 'beta.txt'), 'Beta file: used in module B.\n');
|
||||
fs.writeFileSync(path.join(dir, 'gamma.txt'), 'Gamma file: used in module C.\n');
|
||||
},
|
||||
userPrompt:
|
||||
'Read alpha.txt, beta.txt, and gamma.txt and summarize each in one line.',
|
||||
metric: (r) => firstTurnParallelism(r.assistantTurns[0]),
|
||||
pass: fanoutPass,
|
||||
},
|
||||
{
|
||||
id: 'opus-4-7-fanout-realistic',
|
||||
overlayPath: 'model-overlays/opus-4-7.md',
|
||||
model: 'claude-opus-4-7',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
setupWorkspace: (dir) => {
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'app.ts'),
|
||||
"import { config } from './config';\nimport { util } from './src/util';\n\nexport function main() { return config.name + ':' + util(); }\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'config.ts'),
|
||||
"export const config = { name: 'demo', version: 1 };\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'README.md'),
|
||||
'# demo project\n\nA small demo. Entry: `app.ts`. Config: `config.ts`.\n',
|
||||
);
|
||||
fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'src', 'util.ts'),
|
||||
"export function util() { return 'util-result'; }\n",
|
||||
);
|
||||
},
|
||||
userPrompt:
|
||||
'Audit this project: read app.ts, config.ts, and README.md, and glob for ' +
|
||||
'every .ts file under src/. Summarize what you find in 3 bullet points.',
|
||||
metric: (r) => firstTurnParallelism(r.assistantTurns[0]),
|
||||
pass: fanoutPass,
|
||||
},
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// claude.md / "Dedicated tools over Bash"
|
||||
// -------------------------------------------------------------------------
|
||||
{
|
||||
id: 'claude-dedicated-tools-vs-bash',
|
||||
overlayPath: 'model-overlays/claude.md',
|
||||
model: 'claude-opus-4-7',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
direction: 'lower_is_better',
|
||||
// 5 files + summary = needs more than default 5 turns. SDK throws
|
||||
// instead of returning a result when it hits the cap.
|
||||
maxTurns: 15,
|
||||
setupWorkspace: (dir) => {
|
||||
fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
|
||||
fs.writeFileSync(path.join(dir, 'src', 'index.ts'), "export const x = 1;\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'util.ts'), "export function util() { return 42; }\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'types.ts'), "export type Foo = { a: number };\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'config.ts'), "export const c = { n: 'demo' };\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'api.ts'), "export async function fetchFoo() { return null; }\n");
|
||||
},
|
||||
userPrompt:
|
||||
"List every TypeScript file under src/ and tell me what each exports. " +
|
||||
"You may use any tools available.",
|
||||
// Metric: total Bash tool_use count across the whole session.
|
||||
// The overlay says "prefer Read/Glob/Grep over cat/find/grep shell."
|
||||
// A model following that should emit Glob + Read, not Bash ls/find/cat.
|
||||
metric: bashToolCallCount,
|
||||
pass: lowerIsBetter20Pct,
|
||||
},
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// opus-4-7.md / "Effort-match the step"
|
||||
// -------------------------------------------------------------------------
|
||||
{
|
||||
id: 'opus-4-7-effort-match-trivial',
|
||||
overlayPath: 'model-overlays/opus-4-7.md',
|
||||
model: 'claude-opus-4-7',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
direction: 'lower_is_better',
|
||||
maxTurns: 8,
|
||||
setupWorkspace: (dir) => {
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'config.json'),
|
||||
'{"name": "demo", "version": "1.0.0"}\n',
|
||||
);
|
||||
},
|
||||
userPrompt: "What's the version in config.json?",
|
||||
// Metric: turns used to reach a result.
|
||||
// The overlay says "simple lookups don't need deep reasoning."
|
||||
// A trivial prompt like this should complete in 2 turns
|
||||
// (Read, respond). Extended deliberation bumps this up.
|
||||
metric: turnsToCompletion,
|
||||
pass: lowerIsBetter20Pct,
|
||||
},
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// opus-4-7.md / "Literal interpretation awareness"
|
||||
// -------------------------------------------------------------------------
|
||||
{
|
||||
id: 'opus-4-7-literal-interpretation',
|
||||
overlayPath: 'model-overlays/opus-4-7.md',
|
||||
model: 'claude-opus-4-7',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
direction: 'higher_is_better',
|
||||
// Allow Edit and Write so the model can fix the tests.
|
||||
allowedTools: ['Read', 'Glob', 'Grep', 'Bash', 'Edit', 'Write'],
|
||||
maxTurns: 15,
|
||||
setupWorkspace: (dir) => {
|
||||
// Three failing test files. The "obvious" one is named first; the
|
||||
// other two require the model to enumerate to notice them.
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'auth.test.ts'),
|
||||
"import { test, expect } from 'bun:test';\n" +
|
||||
"test('auth returns true', () => { expect(1).toBe(2); });\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'billing.test.ts'),
|
||||
"import { test, expect } from 'bun:test';\n" +
|
||||
"test('billing computes total', () => { expect('ten').toBe('10'); });\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'notifications.test.ts'),
|
||||
"import { test, expect } from 'bun:test';\n" +
|
||||
"test('notifications fire on save', () => { expect([1,2]).toEqual([1,2,3]); });\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'README.md'),
|
||||
"# demo\n\nSmall project with three failing test files.\n",
|
||||
);
|
||||
},
|
||||
userPrompt: "Fix the failing tests.",
|
||||
// Metric: number of unique test files the model edited/wrote.
|
||||
// The overlay says "fix ALL failing tests, not just the obvious one."
|
||||
// Overlay-ON should touch all 3 test files. Overlay-OFF might stop
|
||||
// at the first one after making it pass.
|
||||
metric: uniqueFilesEdited,
|
||||
pass: higherIsBetter20Pct,
|
||||
},
|
||||
|
||||
// =========================================================================
|
||||
// Sonnet 4.6 variants of the Opus-4.7 fixtures.
|
||||
//
|
||||
// Rationale: /claude.md + /opus-4-7.md overlays measured as no-op or
|
||||
// counterproductive on Opus 4.7. Before deleting the whole overlay stack,
|
||||
// check whether weaker Claude models (Sonnet, Haiku) benefit from the same
|
||||
// nudges. Same overlays, same prompts, same metrics, different model ID.
|
||||
// Sonnet is ~4x cheaper than Opus so these 5 add ~$3 to a run.
|
||||
// =========================================================================
|
||||
|
||||
{
  // Sonnet variant of the three-file fan-out toy fixture: opus-4-7 overlay,
  // run against claude-sonnet-4-6.
  id: 'opus-4-7-fanout-toy-sonnet',
  overlayPath: 'model-overlays/opus-4-7.md',
  model: 'claude-sonnet-4-6',
  trials: 10,
  concurrency: 3,
  setupWorkspace: (dir) => {
    // Three small, independent text files, written in alpha/beta/gamma order.
    const seeds: Array<[string, string]> = [
      ['alpha.txt', 'Alpha file: used in module A.\n'],
      ['beta.txt', 'Beta file: used in module B.\n'],
      ['gamma.txt', 'Gamma file: used in module C.\n'],
    ];
    for (const [fileName, contents] of seeds) {
      fs.writeFileSync(path.join(dir, fileName), contents);
    }
  },
  userPrompt: 'Read alpha.txt, beta.txt, and gamma.txt and summarize each in one line.',
  // Only the first assistant turn is scored.
  metric: (r) => {
    const openingTurn = r.assistantTurns[0];
    return firstTurnParallelism(openingTurn);
  },
  pass: fanoutPass,
},
|
||||
|
||||
{
  // Sonnet variant of the "realistic" fan-out fixture: a tiny project with
  // three root files plus one module under src/.
  id: 'opus-4-7-fanout-realistic-sonnet',
  overlayPath: 'model-overlays/opus-4-7.md',
  model: 'claude-sonnet-4-6',
  trials: 10,
  concurrency: 3,
  setupWorkspace: (dir) => {
    // Write `text` to the workspace-relative path given by `segments`.
    const put = (segments: string[], text: string): void => {
      fs.writeFileSync(path.join(dir, ...segments), text);
    };

    put(
      ['app.ts'],
      "import { config } from './config';\nimport { util } from './src/util';\n\nexport function main() { return config.name + ':' + util(); }\n",
    );
    put(['config.ts'], "export const config = { name: 'demo', version: 1 };\n");
    put(
      ['README.md'],
      '# demo project\n\nA small demo. Entry: `app.ts`. Config: `config.ts`.\n',
    );

    // src/ must exist before its file is written.
    fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
    put(['src', 'util.ts'], "export function util() { return 'util-result'; }\n");
  },
  userPrompt:
    'Audit this project: read app.ts, config.ts, and README.md, and glob for ' +
    'every .ts file under src/. Summarize what you find in 3 bullet points.',
  // Only the first assistant turn is scored.
  metric: (r) => {
    const openingTurn = r.assistantTurns[0];
    return firstTurnParallelism(openingTurn);
  },
  pass: fanoutPass,
},
|
||||
|
||||
{
  // Sonnet variant of the dedicated-tools-vs-Bash fixture: claude.md overlay,
  // scored by bashToolCallCount with direction 'lower_is_better'.
  id: 'claude-dedicated-tools-vs-bash-sonnet',
  overlayPath: 'model-overlays/claude.md',
  model: 'claude-sonnet-4-6',
  trials: 10,
  concurrency: 3,
  direction: 'lower_is_better',
  maxTurns: 15,
  setupWorkspace: (dir) => {
    const srcDir = path.join(dir, 'src');
    fs.mkdirSync(srcDir, { recursive: true });

    // Five one-line modules, each with a single export.
    const modules: Array<[string, string]> = [
      ['index.ts', "export const x = 1;\n"],
      ['util.ts', "export function util() { return 42; }\n"],
      ['types.ts', "export type Foo = { a: number };\n"],
      ['config.ts', "export const c = { n: 'demo' };\n"],
      ['api.ts', "export async function fetchFoo() { return null; }\n"],
    ];
    for (const [fileName, source] of modules) {
      fs.writeFileSync(path.join(srcDir, fileName), source);
    }
  },
  userPrompt:
    "List every TypeScript file under src/ and tell me what each exports. " +
    "You may use any tools available.",
  metric: bashToolCallCount,
  pass: lowerIsBetter20Pct,
},
|
||||
|
||||
{
  // Sonnet variant of the trivial effort-match fixture: a one-file lookup,
  // scored by turnsToCompletion with direction 'lower_is_better'.
  id: 'opus-4-7-effort-match-trivial-sonnet',
  overlayPath: 'model-overlays/opus-4-7.md',
  model: 'claude-sonnet-4-6',
  trials: 10,
  concurrency: 3,
  direction: 'lower_is_better',
  maxTurns: 8,
  setupWorkspace: (dir) => {
    // The workspace holds only the config file the prompt asks about.
    const configFile = path.join(dir, 'config.json');
    const configBody = '{"name": "demo", "version": "1.0.0"}\n';
    fs.writeFileSync(configFile, configBody);
  },
  userPrompt: "What's the version in config.json?",
  metric: turnsToCompletion,
  pass: lowerIsBetter20Pct,
},
|
||||
|
||||
{
  // Sonnet variant of the literal-interpretation fixture: three test files
  // whose assertions fail as written, plus a README. Scored by
  // uniqueFilesEdited with direction 'higher_is_better'.
  id: 'opus-4-7-literal-interpretation-sonnet',
  overlayPath: 'model-overlays/opus-4-7.md',
  model: 'claude-sonnet-4-6',
  trials: 10,
  concurrency: 3,
  direction: 'higher_is_better',
  // Edit and Write are included alongside the read-only tools.
  allowedTools: ['Read', 'Glob', 'Grep', 'Bash', 'Edit', 'Write'],
  maxTurns: 15,
  setupWorkspace: (dir) => {
    // Every test file starts with the same bun:test import line.
    const bunTestImport = "import { test, expect } from 'bun:test';\n";

    const workspaceFiles: Array<[string, string]> = [
      [
        'auth.test.ts',
        bunTestImport + "test('auth returns true', () => { expect(1).toBe(2); });\n",
      ],
      [
        'billing.test.ts',
        bunTestImport + "test('billing computes total', () => { expect('ten').toBe('10'); });\n",
      ],
      [
        'notifications.test.ts',
        bunTestImport +
          "test('notifications fire on save', () => { expect([1,2]).toEqual([1,2,3]); });\n",
      ],
      ['README.md', "# demo\n\nSmall project with three failing test files.\n"],
    ];
    for (const [fileName, contents] of workspaceFiles) {
      fs.writeFileSync(path.join(dir, fileName), contents);
    }
  },
  userPrompt: "Fix the failing tests.",
  metric: uniqueFilesEdited,
  pass: higherIsBetter20Pct,
},
|
||||
];
|
||||
|
||||
// Validate at module load so a broken fixture fails fast at test startup,
|
||||
// not mid-run after burning API dollars.
|
||||
// NOTE(review): validateFixtures is defined outside this chunk; presumably it
// throws on a malformed fixture entry — confirm against its definition.
validateFixtures(OVERLAY_FIXTURES);
|
||||
22
test/fixtures/plans/ui-heavy-feature.md
vendored
Normal file
22
test/fixtures/plans/ui-heavy-feature.md
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
# Plan: User Dashboard Page
|
||||
|
||||
## Context
|
||||
We're shipping a new user dashboard at `/dashboard` showing recent activity,
|
||||
a notifications panel, and quick-action buttons. Users land here after login.
|
||||
|
||||
## UI Scope
|
||||
- New React page component `UserDashboard.tsx` at `src/pages/`
|
||||
- Three new sub-components: `ActivityFeed`, `NotificationsPanel`, `QuickActions`
|
||||
- Tailwind CSS for layout, mobile-first responsive (breakpoints: sm/md/lg)
|
||||
- Empty state, loading skeleton, error state for each panel
|
||||
- Hover states + focus-visible outlines on every interactive element
|
||||
- Modal dialog for "Mark all as read" on notifications panel
|
||||
- Toast notification system for action feedback
|
||||
|
||||
## Backend
|
||||
- New REST endpoint `GET /api/dashboard` returns `{ activity, notifications, quickActions }`
|
||||
- Backed by existing PostgreSQL tables; no schema changes
|
||||
|
||||
## Out of scope
|
||||
- Dark mode (separate plan)
|
||||
- Personalization / customization (separate plan)
|
||||
43
test/fixtures/qa-eval-checkout-ground-truth.json
vendored
Normal file
43
test/fixtures/qa-eval-checkout-ground-truth.json
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
{
|
||||
"fixture": "qa-eval-checkout.html",
|
||||
"bugs": [
|
||||
{
|
||||
"id": "broken-email-regex",
|
||||
"category": "functional",
|
||||
"severity": "high",
|
||||
"description": "Email validation accepts 'user@' as valid — regex pattern [^@]+@[^@] is missing domain requirement",
|
||||
"detection_hint": "email|regex|validation|accepts|invalid|user@|pattern"
|
||||
},
|
||||
{
|
||||
"id": "nan-total",
|
||||
"category": "functional",
|
||||
"severity": "high",
|
||||
"description": "Clearing the quantity field shows 'Total: $NaN' — parseInt on empty string returns NaN with no fallback",
|
||||
"detection_hint": "NaN|total|quantity|empty|price|calculation|clear"
|
||||
},
|
||||
{
|
||||
"id": "cc-field-overflow",
|
||||
"category": "visual",
|
||||
"severity": "medium",
|
||||
"description": "Credit card input has no maxlength attribute — entering >20 characters causes text to overflow the container",
|
||||
"detection_hint": "credit card|maxlength|overflow|cc|input|long|container"
|
||||
},
|
||||
{
|
||||
"id": "missing-required-zip",
|
||||
"category": "functional",
|
||||
"severity": "medium",
|
||||
"description": "Zip code field has no 'required' attribute — form can be submitted without a zip code",
|
||||
"detection_hint": "zip|required|missing|form|submit|shipping|postal"
|
||||
},
|
||||
{
|
||||
"id": "stripe-not-defined",
|
||||
"category": "console",
|
||||
"severity": "high",
|
||||
"description": "Form submit triggers 'Uncaught ReferenceError: stripe is not defined' — payment SDK not loaded",
|
||||
"detection_hint": "stripe|ReferenceError|not defined|console|error|submit|payment"
|
||||
}
|
||||
],
|
||||
"total_bugs": 5,
|
||||
"minimum_detection": 2,
|
||||
"max_false_positives": 5
|
||||
}
|
||||
43
test/fixtures/qa-eval-ground-truth.json
vendored
Normal file
43
test/fixtures/qa-eval-ground-truth.json
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
{
|
||||
"fixture": "qa-eval.html",
|
||||
"bugs": [
|
||||
{
|
||||
"id": "broken-link",
|
||||
"category": "functional",
|
||||
"severity": "medium",
|
||||
"description": "Navigation link 'Resources' points to /nonexistent-404-page which returns 404",
|
||||
"detection_hint": "link|404|broken|dead|nonexistent|Resources"
|
||||
},
|
||||
{
|
||||
"id": "disabled-submit",
|
||||
"category": "functional",
|
||||
"severity": "high",
|
||||
"description": "Contact form submit button has 'disabled' attribute permanently — form can never be submitted",
|
||||
"detection_hint": "disabled|submit|button|form|cannot submit|contact"
|
||||
},
|
||||
{
|
||||
"id": "content-overflow",
|
||||
"category": "visual",
|
||||
"severity": "medium",
|
||||
"description": "Statistics text is clipped by overflow:hidden container — content wider than 400px container",
|
||||
"detection_hint": "overflow|clipped|truncated|hidden|text cut|statistics"
|
||||
},
|
||||
{
|
||||
"id": "missing-alt",
|
||||
"category": "accessibility",
|
||||
"severity": "medium",
|
||||
"description": "Logo image (<img src='/logo.png'>) has no alt attribute",
|
||||
"detection_hint": "alt|accessibility|image|a11y|missing alt|logo"
|
||||
},
|
||||
{
|
||||
"id": "console-error",
|
||||
"category": "console",
|
||||
"severity": "high",
|
||||
"description": "TypeError on page load: Cannot read properties of undefined (reading 'map')",
|
||||
"detection_hint": "console|error|TypeError|undefined|map"
|
||||
}
|
||||
],
|
||||
"total_bugs": 5,
|
||||
"minimum_detection": 2,
|
||||
"max_false_positives": 5
|
||||
}
|
||||
43
test/fixtures/qa-eval-spa-ground-truth.json
vendored
Normal file
43
test/fixtures/qa-eval-spa-ground-truth.json
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
{
|
||||
"fixture": "qa-eval-spa.html",
|
||||
"bugs": [
|
||||
{
|
||||
"id": "broken-route",
|
||||
"category": "functional",
|
||||
"severity": "high",
|
||||
"description": "Products nav link points to #/prodcts (typo) instead of #/products — shows 'Page not found'",
|
||||
"detection_hint": "route|prodcts|typo|products|not found|broken link|navigation"
|
||||
},
|
||||
{
|
||||
"id": "stale-cart-state",
|
||||
"category": "functional",
|
||||
"severity": "medium",
|
||||
"description": "Cart count persists across route changes — never resets when navigating away from products",
|
||||
"detection_hint": "cart|count|state|persist|reset|stale|navigation"
|
||||
},
|
||||
{
|
||||
"id": "async-fetch-error",
|
||||
"category": "functional",
|
||||
"severity": "high",
|
||||
"description": "Product list briefly loads then shows 'Error: Failed to fetch products from API' after 1 second",
|
||||
"detection_hint": "error|fetch|products|API|loading|failed|async"
|
||||
},
|
||||
{
|
||||
"id": "missing-aria-current",
|
||||
"category": "accessibility",
|
||||
"severity": "medium",
|
||||
"description": "Navigation links have no aria-current attribute to indicate the active route",
|
||||
"detection_hint": "aria|current|active|navigation|accessibility|a11y"
|
||||
},
|
||||
{
|
||||
"id": "console-warn-leak",
|
||||
"category": "console",
|
||||
"severity": "medium",
|
||||
"description": "console.warn fires on every route change: 'Possible memory leak detected: 11 event listeners'",
|
||||
"detection_hint": "console|warn|memory leak|listener|event|warning"
|
||||
}
|
||||
],
|
||||
"total_bugs": 5,
|
||||
"minimum_detection": 2,
|
||||
"max_false_positives": 5
|
||||
}
|
||||
5
test/fixtures/review-army-migration.sql
vendored
Normal file
5
test/fixtures/review-army-migration.sql
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
-- Migration: Drop user email and phone_number columns
|
||||
-- WARNING: This migration is intentionally unsafe for testing
|
||||
ALTER TABLE users DROP COLUMN email;
|
||||
ALTER TABLE users DROP COLUMN phone_number;
|
||||
-- No backfill, no reversibility check, no data preservation
|
||||
12
test/fixtures/review-army-n-plus-one.rb
vendored
Normal file
12
test/fixtures/review-army-n-plus-one.rb
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
# N+1 query example — intentionally bad for testing
|
||||
class PostsController
|
||||
def index
|
||||
@posts = Post.all
|
||||
@posts.each do |post|
|
||||
# N+1: queries Author table for every post
|
||||
puts post.author.name
|
||||
# N+1: queries Comments table for every post
|
||||
puts post.comments.count
|
||||
end
|
||||
end
|
||||
end
|
||||
86
test/fixtures/review-eval-design-slop.css
vendored
Normal file
86
test/fixtures/review-eval-design-slop.css
vendored
Normal file
@@ -0,0 +1,86 @@
|
||||
/* Planted design anti-patterns for E2E eval — 7 issues (issue 6 is labelled in the companion HTML fixture) */
|
||||
|
||||
/* Issue 1: [HIGH] Blacklisted font (Papyrus) */
|
||||
/* Issue 2: [HIGH] Body text < 16px (14px) */
|
||||
body {
|
||||
font-family: 'Papyrus', sans-serif;
|
||||
font-size: 14px;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
/* Issue 5: [MEDIUM] Purple/violet gradient background */
|
||||
.hero {
|
||||
background: linear-gradient(135deg, #6366f1, #8b5cf6);
|
||||
text-align: center;
|
||||
padding: 80px 20px;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.hero h1 {
|
||||
text-align: center;
|
||||
font-size: 48px;
|
||||
}
|
||||
|
||||
.hero p {
|
||||
text-align: center;
|
||||
font-size: 20px;
|
||||
}
|
||||
|
||||
/* Issue 7: [LOW] 3-column feature grid with icon circles */
|
||||
.features {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(3, 1fr);
|
||||
gap: 24px;
|
||||
padding: 60px 40px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.feature-card {
|
||||
border-radius: 24px;
|
||||
padding: 32px;
|
||||
text-align: center;
|
||||
background: #f9fafb;
|
||||
}
|
||||
|
||||
/* Icon in colored circle — AI slop pattern */
|
||||
.icon-circle {
|
||||
width: 60px;
|
||||
height: 60px;
|
||||
border-radius: 50%;
|
||||
background: #ede9fe;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
margin: 0 auto 16px;
|
||||
font-size: 24px;
|
||||
}
|
||||
|
||||
/* Issue 3: [HIGH] outline: none without replacement */
|
||||
button {
|
||||
outline: none;
|
||||
background: #6366f1;
|
||||
color: white;
|
||||
border: none;
|
||||
padding: 12px 24px;
|
||||
border-radius: 24px;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.small-link {
|
||||
font-size: 11px;
|
||||
padding: 4px 8px;
|
||||
}
|
||||
|
||||
/* Issue 4: [HIGH] !important usage */
|
||||
.override {
|
||||
color: red !important;
|
||||
margin-left: 10px !important;
|
||||
}
|
||||
|
||||
.footer {
|
||||
text-align: center;
|
||||
padding: 40px;
|
||||
background: #1e1b4b;
|
||||
color: white;
|
||||
}
|
||||
41
test/fixtures/review-eval-design-slop.html
vendored
Normal file
41
test/fixtures/review-eval-design-slop.html
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<link rel="stylesheet" href="styles.css">
|
||||
<title>Our Platform</title>
|
||||
</head>
|
||||
<body>
|
||||
<!-- Issue 6: [MEDIUM] Generic hero copy ("Welcome to...", "all-in-one solution") -->
|
||||
<div class="hero">
|
||||
<h1>Welcome to Our Platform</h1>
|
||||
<p>Your all-in-one solution for everything you need</p>
|
||||
<button>Get Started</button>
|
||||
</div>
|
||||
|
||||
<!-- Issue 7: [LOW] 3-column feature grid with icon-in-circle + title + description -->
|
||||
<div class="features">
|
||||
<div class="feature-card">
|
||||
<div class="icon-circle">★</div>
|
||||
<h3>Feature One</h3>
|
||||
<p>A short description of this amazing feature that will change your life.</p>
|
||||
</div>
|
||||
<div class="feature-card">
|
||||
<div class="icon-circle">⚡</div>
|
||||
<h3>Feature Two</h3>
|
||||
<p>Another incredible capability that sets us apart from the competition.</p>
|
||||
</div>
|
||||
<div class="feature-card">
|
||||
<div class="icon-circle">⚙</div>
|
||||
<h3>Feature Three</h3>
|
||||
<p>Yet another powerful tool to streamline your workflow effortlessly.</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="footer">
|
||||
<p class="override">Unlock the power of our platform today</p>
|
||||
<a href="#" class="small-link">Terms of Service</a>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
30
test/fixtures/review-eval-enum-diff.rb
vendored
Normal file
30
test/fixtures/review-eval-enum-diff.rb
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
# Feature branch version: adds "returned" status but misses consumers
|
||||
class Order < ApplicationRecord
|
||||
STATUSES = %w[pending processing shipped delivered returned].freeze
|
||||
|
||||
validates :status, inclusion: { in: STATUSES }
|
||||
|
||||
def display_status
|
||||
case status
|
||||
when 'pending' then 'Awaiting processing'
|
||||
when 'processing' then 'Being prepared'
|
||||
when 'shipped' then 'On the way'
|
||||
when 'delivered' then 'Delivered'
|
||||
# BUG: 'returned' not handled — falls through to nil
|
||||
end
|
||||
end
|
||||
|
||||
def can_cancel?
|
||||
# BUG: should 'returned' be cancellable? Not considered.
|
||||
%w[pending processing].include?(status)
|
||||
end
|
||||
|
||||
def notify_customer
|
||||
case status
|
||||
when 'pending' then OrderMailer.confirmation(self).deliver_later
|
||||
when 'shipped' then OrderMailer.shipped(self).deliver_later
|
||||
when 'delivered' then OrderMailer.delivered(self).deliver_later
|
||||
# BUG: 'returned' has no notification — customer won't know return was received
|
||||
end
|
||||
end
|
||||
end
|
||||
27
test/fixtures/review-eval-enum.rb
vendored
Normal file
27
test/fixtures/review-eval-enum.rb
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
# Existing file on main: order model with status handling
|
||||
class Order < ApplicationRecord
|
||||
STATUSES = %w[pending processing shipped delivered].freeze
|
||||
|
||||
validates :status, inclusion: { in: STATUSES }
|
||||
|
||||
def display_status
|
||||
case status
|
||||
when 'pending' then 'Awaiting processing'
|
||||
when 'processing' then 'Being prepared'
|
||||
when 'shipped' then 'On the way'
|
||||
when 'delivered' then 'Delivered'
|
||||
end
|
||||
end
|
||||
|
||||
def can_cancel?
|
||||
%w[pending processing].include?(status)
|
||||
end
|
||||
|
||||
def notify_customer
|
||||
case status
|
||||
when 'pending' then OrderMailer.confirmation(self).deliver_later
|
||||
when 'shipped' then OrderMailer.shipped(self).deliver_later
|
||||
when 'delivered' then OrderMailer.delivered(self).deliver_later
|
||||
end
|
||||
end
|
||||
end
|
||||
14
test/fixtures/review-eval-vuln.rb
vendored
Normal file
14
test/fixtures/review-eval-vuln.rb
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
class UserController < ApplicationController
|
||||
def show
|
||||
# SQL injection — interpolating user input directly into query
|
||||
@user = User.where("id = #{params[:id]}").first
|
||||
render json: @user
|
||||
end
|
||||
|
||||
def promote
|
||||
# Bypasses ActiveRecord validations — update_column skips callbacks + validation
|
||||
@user = User.find(params[:id])
|
||||
@user.update_column(:role, 'admin')
|
||||
head :ok
|
||||
end
|
||||
end
|
||||
Reference in New Issue
Block a user