Initial import from garrytan/gstack@026751e (main snapshot via local relay)
Some checks failed
Workflow Lint / actionlint (push) Has been cancelled
Build CI Image / build (push) Has been cancelled
Skill Docs Freshness / check-freshness (push) Has been cancelled
Periodic Evals / build-image (push) Has been cancelled
Periodic Evals / evals (map[file:test/codex-e2e.test.ts name:e2e-codex]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/gemini-e2e.test.ts name:e2e-gemini]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-design.test.ts name:e2e-design]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-plan.test.ts name:e2e-plan]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-bugs.test.ts name:e2e-qa-bugs]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-workflow.test.ts name:e2e-qa-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-review.test.ts name:e2e-review]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-workflow.test.ts name:e2e-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-routing-e2e.test.ts name:e2e-routing]) (push) Has been cancelled
Some checks failed
Workflow Lint / actionlint (push) Has been cancelled
Build CI Image / build (push) Has been cancelled
Skill Docs Freshness / check-freshness (push) Has been cancelled
Periodic Evals / build-image (push) Has been cancelled
Periodic Evals / evals (map[file:test/codex-e2e.test.ts name:e2e-codex]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/gemini-e2e.test.ts name:e2e-gemini]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-design.test.ts name:e2e-design]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-plan.test.ts name:e2e-plan]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-bugs.test.ts name:e2e-qa-bugs]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-workflow.test.ts name:e2e-qa-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-review.test.ts name:e2e-review]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-workflow.test.ts name:e2e-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-routing-e2e.test.ts name:e2e-routing]) (push) Has been cancelled
Source: https://github.com/garrytan/gstack/commit/026751e
This commit is contained in:
137
test/benchmark-runner.test.ts
Normal file
137
test/benchmark-runner.test.ts
Normal file
@@ -0,0 +1,137 @@
|
||||
/**
|
||||
* Unit tests for the benchmark runner.
|
||||
*
|
||||
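* Covers the pure helpers that support the benchmark runner:
* - Cost estimation (estimateCostUsd) and PRICING table coverage
* - Tool compatibility lookups (missingTools, TOOL_COMPATIBILITY)
* - Output formatters (table, json, markdown) render the expected content
*
* NOTE: runner orchestration (adapters running in parallel via
* Promise.allSettled, unavailable adapters being skipped or marked depending
* on flag, per-adapter errors not aborting the batch) is not exercised by the
* tests in this file.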
|
||||
*
|
||||
* Does NOT exercise live CLIs — see test/providers.e2e.test.ts for those.
|
||||
*/
|
||||
|
||||
import { test, expect } from 'bun:test';
|
||||
import { formatTable, formatJson, formatMarkdown, type BenchmarkReport } from './helpers/benchmark-runner';
|
||||
import { estimateCostUsd, PRICING } from './helpers/pricing';
|
||||
import { missingTools, TOOL_COMPATIBILITY } from './helpers/tool-map';
|
||||
|
||||
// An unrecognized model id is expected to price at 0 rather than crash.
test('estimateCostUsd returns 0 for unknown model (no crash)', () => {
  const usage = { input: 1000, output: 500 };
  expect(estimateCostUsd(usage, 'unknown-model-7b')).toBe(0);
});
|
||||
|
||||
test('estimateCostUsd computes correctly for known Claude model', () => {
  // Rates this test assumes for claude-opus-4-7: $15/MTok input, $75/MTok output.
  // Expected total: 1M input ($15.00) + 0.5M output ($37.50) = $52.50.
  const usage = { input: 1_000_000, output: 500_000 };
  const estimated = estimateCostUsd(usage, 'claude-opus-4-7');
  expect(estimated).toBeCloseTo(52.50, 2);
});
|
||||
|
||||
test('estimateCostUsd applies cached input discount alongside uncached input', () => {
  // tokens.input counts uncached tokens only; tokens.cached is a disjoint
  // bucket of cache reads priced at 10% of the input rate.
  const model = 'claude-opus-4-7';

  // Cache-only usage: 1M cached tokens at 10% of $15 -> $1.50.
  const cachedOnly = estimateCostUsd({ input: 0, output: 0, cached: 1_000_000 }, model);
  expect(cachedOnly).toBeCloseTo(1.50, 2);

  // Mixed usage: 500K uncached ($7.50) + 500K cached ($0.75) -> $8.25.
  const mixed = estimateCostUsd({ input: 500_000, output: 0, cached: 500_000 }, model);
  expect(mixed).toBeCloseTo(8.25, 2);
});
|
||||
|
||||
test('PRICING table covers the key model families', () => {
  // One representative model id per entry; checked in the listed order.
  const requiredModels = [
    'claude-opus-4-7',
    'claude-sonnet-4-6',
    'gpt-5.4',
    'gemini-2.5-pro',
  ] as const;

  for (const model of requiredModels) {
    expect(PRICING[model]).toBeDefined();
  }
});
|
||||
|
||||
test('missingTools reports unsupported tools per provider', () => {
  // GPT/Codex: Edit, Glob and Grep are all expected back as missing.
  const gptMissing = missingTools('gpt', ['Edit', 'Glob', 'Grep']);
  expect(gptMissing).toEqual(['Edit', 'Glob', 'Grep']);

  // Claude: every core tool requested is supported, so nothing is missing.
  const claudeMissing = missingTools('claude', ['Edit', 'Glob', 'Grep', 'Bash', 'Read']);
  expect(claudeMissing).toEqual([]);

  // Gemini: very limited agentic surface; both Bash and Edit are missing.
  const geminiMissing = missingTools('gemini', ['Bash', 'Edit']);
  expect(geminiMissing).toEqual(['Bash', 'Edit']);
});
|
||||
|
||||
test('TOOL_COMPATIBILITY is populated for all three families', () => {
  // Each provider family must have an entry in the compatibility map.
  const { claude, gpt, gemini } = TOOL_COMPATIBILITY;
  expect(claude).toBeDefined();
  expect(gpt).toBeDefined();
  expect(gemini).toBeDefined();
});
|
||||
|
||||
test('formatTable handles a report with mixed success/error/unavailable entries', () => {
  type Entry = BenchmarkReport['entries'][number];

  // Successful run: has a result, a cost and a quality score.
  const succeeded: Entry = {
    provider: 'claude',
    family: 'claude',
    available: true,
    result: {
      output: 'ok',
      tokens: { input: 100, output: 200 },
      durationMs: 800,
      toolCalls: 3,
      modelUsed: 'claude-opus-4-7',
    },
    costUsd: 0.0165,
    qualityScore: 9.2,
  };

  // Adapter was available but the run ended with an auth error.
  const errored: Entry = {
    provider: 'gpt',
    family: 'gpt',
    available: true,
    result: {
      output: '',
      tokens: { input: 0, output: 0 },
      durationMs: 200,
      toolCalls: 0,
      modelUsed: 'gpt-5.4',
      error: { code: 'auth', reason: 'codex login required' },
    },
  };

  // Adapter could not run at all.
  const unavailable: Entry = {
    provider: 'gemini',
    family: 'gemini',
    available: false,
    unavailable_reason: 'gemini CLI not on PATH',
  };

  const report: BenchmarkReport = {
    prompt: 'test prompt',
    workdir: '/tmp',
    startedAt: '2026-04-16T20:00:00Z',
    durationMs: 1500,
    entries: [succeeded, errored, unavailable],
  };

  const rendered = formatTable(report);

  // Each entry kind must leave a recognizable trace in the rendered table.
  const expectedFragments = ['claude-opus-4-7', 'ERROR auth', 'unavailable', '9.2/10'];
  for (const fragment of expectedFragments) {
    expect(rendered).toContain(fragment);
  }
});
|
||||
|
||||
test('formatJson produces parseable JSON', () => {
  // Minimal report with no entries; enough to check the JSON round trip.
  const emptyReport: BenchmarkReport = {
    prompt: 'x',
    workdir: '/tmp',
    startedAt: '2026-04-16T20:00:00Z',
    durationMs: 100,
    entries: [],
  };

  // JSON.parse throws on malformed output, which would fail the test.
  const roundTripped = JSON.parse(formatJson(emptyReport));

  expect(roundTripped.prompt).toBe('x');
  expect(roundTripped.entries).toEqual([]);
});
|
||||
|
||||
test('formatMarkdown produces a table header', () => {
  // Minimal report with no entries; the heading and column header should still render.
  const emptyReport: BenchmarkReport = {
    prompt: 'x',
    workdir: '/tmp',
    startedAt: '2026-04-16T20:00:00Z',
    durationMs: 100,
    entries: [],
  };

  const rendered = formatMarkdown(emptyReport);

  for (const fragment of ['# Benchmark report', '| Model | Latency |']) {
    expect(rendered).toContain(fragment);
  }
});
|
||||
Reference in New Issue
Block a user