Initial import from garrytan/gstack@026751e (main snapshot via local relay)
Some checks failed
Workflow Lint / actionlint (push) Has been cancelled
Build CI Image / build (push) Has been cancelled
Skill Docs Freshness / check-freshness (push) Has been cancelled
Periodic Evals / build-image (push) Has been cancelled
Periodic Evals / evals (map[file:test/codex-e2e.test.ts name:e2e-codex]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/gemini-e2e.test.ts name:e2e-gemini]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-design.test.ts name:e2e-design]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-plan.test.ts name:e2e-plan]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-bugs.test.ts name:e2e-qa-bugs]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-workflow.test.ts name:e2e-qa-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-review.test.ts name:e2e-review]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-workflow.test.ts name:e2e-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-routing-e2e.test.ts name:e2e-routing]) (push) Has been cancelled
Some checks failed
Workflow Lint / actionlint (push) Has been cancelled
Build CI Image / build (push) Has been cancelled
Skill Docs Freshness / check-freshness (push) Has been cancelled
Periodic Evals / build-image (push) Has been cancelled
Periodic Evals / evals (map[file:test/codex-e2e.test.ts name:e2e-codex]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/gemini-e2e.test.ts name:e2e-gemini]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-design.test.ts name:e2e-design]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-plan.test.ts name:e2e-plan]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-bugs.test.ts name:e2e-qa-bugs]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-workflow.test.ts name:e2e-qa-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-review.test.ts name:e2e-review]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-workflow.test.ts name:e2e-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-routing-e2e.test.ts name:e2e-routing]) (push) Has been cancelled
Source: https://github.com/garrytan/gstack/commit/026751e
This commit is contained in:
345
test/skill-e2e-opus-47.test.ts
Normal file
345
test/skill-e2e-opus-47.test.ts
Normal file
@@ -0,0 +1,345 @@
|
||||
/**
|
||||
* Opus 4.7 behavior evals.
|
||||
*
|
||||
* Two cases, both pinned to claude-opus-4-7:
|
||||
*
|
||||
* 1. Fanout rate — the "Fan out explicitly" overlay nudge should make 4.7
|
||||
* spawn parallel tool calls when the prompt has independent sub-problems.
|
||||
* A/B: SKILL.md regenerated with `--model opus-4-7` (overlay ON) vs
|
||||
* default `--model claude` (overlay OFF). Assert A ≥ B on parallel-call
|
||||
* count in the first assistant turn.
|
||||
*
|
||||
* 2. Routing precision — the new "when in doubt, invoke the skill" policy
|
||||
* should route ambiguous dev prompts to the right skill WITHOUT routing
|
||||
* casual/non-dev prompts. A handful of positive and negative controls.
|
||||
*
|
||||
* Both cases require a running Anthropic API key. Gated behind EVALS=1.
|
||||
* Classify as `periodic` in touchfiles — behavior measurement, not gate.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, afterAll } from 'bun:test';
|
||||
import { runSkillTest } from './helpers/session-runner';
|
||||
import { EvalCollector } from './helpers/eval-store';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const OPUS_47 = 'claude-opus-4-7';
|
||||
|
||||
const evalsEnabled = !!process.env.EVALS;
|
||||
const describeE2E = evalsEnabled ? describe : describe.skip;
|
||||
const evalCollector = evalsEnabled ? new EvalCollector('e2e-opus-47') : null;
|
||||
const runId = new Date().toISOString().replace(/[:.]/g, '').replace('T', '-').slice(0, 15);
|
||||
|
||||
// --- Helpers ---
|
||||
|
||||
/** Skills that must exist as individual .claude/skills/{name}/SKILL.md files
|
||||
* for Claude Code's auto-discovery to treat them as invokable via Skill tool.
|
||||
* Matches the pattern in skill-routing-e2e.test.ts. */
|
||||
const INSTALLED_SKILLS = [
|
||||
'qa', 'qa-only', 'ship', 'review', 'plan-ceo-review', 'plan-eng-review',
|
||||
'plan-design-review', 'design-review', 'design-consultation', 'retro',
|
||||
'document-release', 'investigate', 'office-hours', 'browse',
|
||||
];
|
||||
|
||||
/** Write a scratch root with:
|
||||
* - Per-skill SKILL.md files under .claude/skills/ (so Skill tool sees them)
|
||||
* - Project CLAUDE.md with explicit routing rules AND (optionally) the
|
||||
* 4.7 overlay content directly inlined so `claude -p` sees it
|
||||
* - git init
|
||||
*
|
||||
* `includeOverlay` controls whether the opus-4-7 nudges (Fan out, Literal,
|
||||
* etc.) get inlined into CLAUDE.md — this is the A/B axis for the fanout
|
||||
* test. `claude -p` doesn't auto-load SKILL.md content, so CLAUDE.md is
|
||||
* the only way to make the overlay visible to the model in this test
|
||||
* harness.
|
||||
*/
|
||||
function mkEvalRoot(suffix: string, includeOverlay: boolean): string {
|
||||
const tmp = fs.mkdtempSync(path.join(os.tmpdir(), `opus47-${suffix}-`));
|
||||
|
||||
// Regenerate at opus-4-7 so the per-skill SKILL.md files reflect that
|
||||
// model's overlay. If includeOverlay is false we'll re-regen at default
|
||||
// later just for the root SKILL.md copy. For individual skills, opus-4-7
|
||||
// content doesn't matter for the routing test (we only need discovery).
|
||||
const result = spawnSync(
|
||||
'bun',
|
||||
['run', 'scripts/gen-skill-docs.ts', '--model', includeOverlay ? 'opus-4-7' : 'claude'],
|
||||
{ cwd: ROOT, stdio: 'pipe', encoding: 'utf-8', timeout: 60_000 },
|
||||
);
|
||||
if (result.status !== 0) {
|
||||
throw new Error(`gen-skill-docs failed: ${result.stderr}`);
|
||||
}
|
||||
|
||||
// Install per-skill SKILL.md files for Skill tool discovery.
|
||||
const skillsDir = path.join(tmp, '.claude', 'skills');
|
||||
for (const skill of INSTALLED_SKILLS) {
|
||||
const src = path.join(ROOT, skill, 'SKILL.md');
|
||||
if (!fs.existsSync(src)) continue;
|
||||
const destDir = path.join(skillsDir, skill);
|
||||
fs.mkdirSync(destDir, { recursive: true });
|
||||
fs.copyFileSync(src, path.join(destDir, 'SKILL.md'));
|
||||
}
|
||||
|
||||
// Extract the opus-4-7 model-overlay content from the checked-in file
|
||||
// so we can inline it into CLAUDE.md when includeOverlay is true.
|
||||
const overlayText = includeOverlay
|
||||
? fs.readFileSync(path.join(ROOT, 'model-overlays', 'opus-4-7.md'), 'utf-8')
|
||||
.replace(/\{\{INHERIT:claude\}\}\s*/, '')
|
||||
.trim()
|
||||
: '';
|
||||
|
||||
// Project CLAUDE.md. Explicit routing rules so the agent reaches for
|
||||
// Skill tool on matching prompts, plus the optional overlay.
|
||||
const routingBlock = `## Skill routing
|
||||
|
||||
When the user's request matches an available skill, invoke it via the Skill tool
|
||||
as your FIRST action. The skill has multi-step workflows, checklists, and quality
|
||||
gates that produce better results than an ad-hoc answer. When in doubt, invoke.
|
||||
|
||||
- Bugs, errors, "why is this broken", "wtf" → invoke investigate
|
||||
- Ship, deploy, "send it", create a PR → invoke ship
|
||||
- QA, test the site, "does this work" → invoke qa
|
||||
- Code review, check my diff → invoke review
|
||||
- Product ideas, brainstorming, "is this worth building" → invoke office-hours
|
||||
- Architecture, "does this design make sense" → invoke plan-eng-review
|
||||
- Design system, visual polish → invoke design-review
|
||||
- Weekly retro, what did we ship → invoke retro`;
|
||||
|
||||
const claudeMd = includeOverlay
|
||||
? `# Project\n\n${overlayText}\n\n${routingBlock}\n`
|
||||
: `# Project\n\n${routingBlock}\n`;
|
||||
|
||||
fs.writeFileSync(path.join(tmp, 'CLAUDE.md'), claudeMd);
|
||||
fs.writeFileSync(path.join(tmp, 'package.json'), '{"name":"opus47-eval"}');
|
||||
|
||||
const git = (args: string[]) =>
|
||||
spawnSync('git', args, { cwd: tmp, stdio: 'pipe', timeout: 5_000 });
|
||||
git(['init']);
|
||||
git(['config', 'user.email', 't@t.com']);
|
||||
git(['config', 'user.name', 'T']);
|
||||
git(['add', '.']);
|
||||
git(['commit', '-m', 'init']);
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
/** Count parallel tool calls in the first assistant turn. */
|
||||
function firstTurnParallelism(transcript: any[]): number {
|
||||
const firstAssistant = transcript.find((e) => e.type === 'assistant');
|
||||
if (!firstAssistant) return 0;
|
||||
const content = firstAssistant.message?.content ?? [];
|
||||
return content.filter((c: any) => c.type === 'tool_use').length;
|
||||
}
|
||||
|
||||
interface RoutingCase {
|
||||
name: string;
|
||||
prompt: string;
|
||||
shouldRoute: boolean;
|
||||
expectedSkill?: string;
|
||||
}
|
||||
|
||||
/** Small, intentionally chosen routing cases. Positive cases are ambiguous
|
||||
* phrasings the user actually says, not template text. Negative cases are
|
||||
* casual or off-topic prompts that match routing keywords but shouldn't
|
||||
* trigger a skill. */
|
||||
const ROUTING_CASES: RoutingCase[] = [
|
||||
// Positive — should route
|
||||
{ name: 'pos-wtf-bug', prompt: "wtf is this error coming from auth.ts:47 when the cookie expires?", shouldRoute: true, expectedSkill: 'investigate' },
|
||||
{ name: 'pos-send-it', prompt: "ok this is good enough, let's send it.", shouldRoute: true, expectedSkill: 'ship' },
|
||||
{ name: 'pos-does-it-work', prompt: "I just pushed the login flow changes. Test the deployed site and find any bugs.", shouldRoute: true, expectedSkill: 'qa' },
|
||||
// Negative — should NOT route
|
||||
{ name: 'neg-syntax-q', prompt: "wtf does this Python list comprehension syntax even mean, [x for x in y if z]?", shouldRoute: false },
|
||||
{ name: 'neg-algo-q', prompt: "does this bubble sort algorithm actually work in O(n log n)?", shouldRoute: false },
|
||||
{ name: 'neg-slack-send', prompt: "can you help me write the slack message? I want to send it to the team.", shouldRoute: false },
|
||||
];
|
||||
|
||||
// --- Tests ---
|
||||
|
||||
describeE2E('Opus 4.7 overlay behavior evals', () => {
|
||||
afterAll(() => {
|
||||
evalCollector?.finalize();
|
||||
// Restore working tree: mkEvalRoot runs `gen-skill-docs` with various
|
||||
// --model flags, leaving the in-repo SKILL.md files generated at
|
||||
// whichever model ran last. Reset to the default (claude) so the tree
|
||||
// matches what would be checked in.
|
||||
spawnSync('bun', ['run', 'scripts/gen-skill-docs.ts'], {
|
||||
cwd: ROOT,
|
||||
stdio: 'pipe',
|
||||
timeout: 60_000,
|
||||
});
|
||||
});
|
||||
|
||||
test(
|
||||
'fanout: overlay ON emits >= parallel calls vs overlay OFF on 3-file investigate task',
|
||||
async () => {
|
||||
const armA = mkEvalRoot('on', true);
|
||||
const armB = mkEvalRoot('off', false);
|
||||
|
||||
// Populate three tiny independent files in each arm. The prompt asks
|
||||
// the agent to read all three and report. Opus 4.7 (without nudge)
|
||||
// tends to serialize; with the nudge it should parallelize.
|
||||
for (const dir of [armA, armB]) {
|
||||
fs.writeFileSync(path.join(dir, 'alpha.txt'), 'alpha content: 1\n');
|
||||
fs.writeFileSync(path.join(dir, 'beta.txt'), 'beta content: 2\n');
|
||||
fs.writeFileSync(path.join(dir, 'gamma.txt'), 'gamma content: 3\n');
|
||||
}
|
||||
|
||||
const prompt =
|
||||
"Read alpha.txt, beta.txt, and gamma.txt in this directory and report what's inside each. These three reads are independent.";
|
||||
|
||||
try {
|
||||
const [resA, resB] = await Promise.all([
|
||||
runSkillTest({
|
||||
prompt,
|
||||
workingDirectory: armA,
|
||||
maxTurns: 5,
|
||||
allowedTools: ['Read', 'Bash', 'Glob', 'Grep'],
|
||||
timeout: 90_000,
|
||||
testName: 'fanout-arm-overlay-on',
|
||||
runId,
|
||||
model: OPUS_47,
|
||||
}),
|
||||
runSkillTest({
|
||||
prompt,
|
||||
workingDirectory: armB,
|
||||
maxTurns: 5,
|
||||
allowedTools: ['Read', 'Bash', 'Glob', 'Grep'],
|
||||
timeout: 90_000,
|
||||
testName: 'fanout-arm-overlay-off',
|
||||
runId,
|
||||
model: OPUS_47,
|
||||
}),
|
||||
]);
|
||||
|
||||
const parA = firstTurnParallelism(resA.transcript);
|
||||
const parB = firstTurnParallelism(resB.transcript);
|
||||
|
||||
console.log(
|
||||
`[opus-4-7 fanout] arm A (overlay ON): ${parA} parallel tool calls in first turn; ` +
|
||||
`arm B (overlay OFF): ${parB}`,
|
||||
);
|
||||
console.log(` cost A=$${resA.costEstimate.estimatedCost.toFixed(2)} B=$${resB.costEstimate.estimatedCost.toFixed(2)}`);
|
||||
|
||||
evalCollector?.addTest({
|
||||
name: 'fanout-arm-overlay-on',
|
||||
suite: 'Opus 4.7 overlay',
|
||||
tier: 'e2e',
|
||||
passed: parA >= parB,
|
||||
duration_ms: resA.duration,
|
||||
cost_usd: resA.costEstimate.estimatedCost,
|
||||
transcript: resA.transcript,
|
||||
output: `parallel=${parA}`,
|
||||
turns_used: resA.costEstimate.turnsUsed,
|
||||
exit_reason: resA.exitReason,
|
||||
});
|
||||
evalCollector?.addTest({
|
||||
name: 'fanout-arm-overlay-off',
|
||||
suite: 'Opus 4.7 overlay',
|
||||
tier: 'e2e',
|
||||
passed: true, // baseline arm, recorded for comparison
|
||||
duration_ms: resB.duration,
|
||||
cost_usd: resB.costEstimate.estimatedCost,
|
||||
transcript: resB.transcript,
|
||||
output: `parallel=${parB}`,
|
||||
turns_used: resB.costEstimate.turnsUsed,
|
||||
exit_reason: resB.exitReason,
|
||||
});
|
||||
|
||||
// Main assertion: overlay arm is at least as parallel as baseline.
|
||||
expect(parA, `overlay arm emitted ${parA} parallel calls, baseline ${parB}`).toBeGreaterThanOrEqual(parB);
|
||||
} finally {
|
||||
fs.rmSync(armA, { recursive: true, force: true });
|
||||
fs.rmSync(armB, { recursive: true, force: true });
|
||||
}
|
||||
},
|
||||
240_000,
|
||||
);
|
||||
|
||||
test(
|
||||
'routing precision: positives route, negatives do not',
|
||||
async () => {
|
||||
// Single SKILL.md tree shared by all cases. We run claude-opus-4-7 with
|
||||
// tool access to Skill; measure whether the first tool call is Skill(..)
|
||||
// and if so, which skill.
|
||||
const root = mkEvalRoot('routing', true);
|
||||
|
||||
try {
|
||||
const results = await Promise.all(
|
||||
ROUTING_CASES.map((c) =>
|
||||
runSkillTest({
|
||||
prompt: c.prompt,
|
||||
workingDirectory: root,
|
||||
maxTurns: 3,
|
||||
allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'],
|
||||
timeout: 90_000,
|
||||
testName: `routing-${c.name}`,
|
||||
runId,
|
||||
model: OPUS_47,
|
||||
}).then((r) => ({ c, r })),
|
||||
),
|
||||
);
|
||||
|
||||
let tp = 0, fn = 0, fp = 0, tn = 0;
|
||||
const rows: string[] = [];
|
||||
let totalCost = 0;
|
||||
|
||||
for (const { c, r } of results) {
|
||||
const skillCalls = r.toolCalls.filter((tc) => tc.tool === 'Skill');
|
||||
const routed = skillCalls.length > 0;
|
||||
const actualSkill = routed ? skillCalls[0]?.input?.skill : undefined;
|
||||
|
||||
const correct = c.shouldRoute
|
||||
? routed && (!c.expectedSkill || actualSkill === c.expectedSkill)
|
||||
: !routed;
|
||||
|
||||
if (c.shouldRoute && routed) tp++;
|
||||
else if (c.shouldRoute && !routed) fn++;
|
||||
else if (!c.shouldRoute && routed) fp++;
|
||||
else tn++;
|
||||
|
||||
totalCost += r.costEstimate.estimatedCost;
|
||||
rows.push(
|
||||
` ${c.name.padEnd(18)} routed=${String(routed).padEnd(5)} skill=${String(actualSkill).padEnd(16)} ` +
|
||||
`expected=${c.shouldRoute ? (c.expectedSkill ?? 'any') : '(none)'} ${correct ? 'OK' : 'MISS'}`,
|
||||
);
|
||||
|
||||
evalCollector?.addTest({
|
||||
name: `routing-${c.name}`,
|
||||
suite: 'Opus 4.7 routing',
|
||||
tier: 'e2e',
|
||||
passed: correct,
|
||||
duration_ms: r.duration,
|
||||
cost_usd: r.costEstimate.estimatedCost,
|
||||
transcript: r.transcript,
|
||||
output: `routed=${routed} actual=${actualSkill ?? '(none)'} expected=${c.shouldRoute ? c.expectedSkill ?? 'any' : '(none)'}`,
|
||||
turns_used: r.costEstimate.turnsUsed,
|
||||
exit_reason: r.exitReason,
|
||||
});
|
||||
}
|
||||
|
||||
const posCount = ROUTING_CASES.filter((c) => c.shouldRoute).length;
|
||||
const negCount = ROUTING_CASES.length - posCount;
|
||||
const tpRate = posCount > 0 ? tp / posCount : 0;
|
||||
const fpRate = negCount > 0 ? fp / negCount : 0;
|
||||
|
||||
console.log(`[opus-4-7 routing] total cost $${totalCost.toFixed(2)}`);
|
||||
console.log(rows.join('\n'));
|
||||
console.log(
|
||||
` TP=${tp}/${posCount} (${(tpRate * 100).toFixed(0)}%) FN=${fn} ` +
|
||||
`FP=${fp}/${negCount} (${(fpRate * 100).toFixed(0)}%) TN=${tn}`,
|
||||
);
|
||||
|
||||
// Thresholds from the test plan artifact: TP >= 80%, FP <= 30%.
|
||||
// With a small N we loosen slightly: TP >= 66% (2 of 3 positive),
|
||||
// FP <= 33% (no more than 1 of 3 negatives).
|
||||
expect(tpRate, `true-positive rate ${(tpRate * 100).toFixed(0)}% (need >= 66%)`).toBeGreaterThanOrEqual(2 / 3);
|
||||
expect(fpRate, `false-positive rate ${(fpRate * 100).toFixed(0)}% (need <= 33%)`).toBeLessThanOrEqual(1 / 3);
|
||||
} finally {
|
||||
fs.rmSync(root, { recursive: true, force: true });
|
||||
}
|
||||
},
|
||||
360_000,
|
||||
);
|
||||
});
|
||||
Reference in New Issue
Block a user