Initial import from garrytan/gstack@026751e (main snapshot via local relay)
Some checks failed
Workflow Lint / actionlint (push) Has been cancelled
Build CI Image / build (push) Has been cancelled
Skill Docs Freshness / check-freshness (push) Has been cancelled
Periodic Evals / build-image (push) Has been cancelled
Periodic Evals / evals (map[file:test/codex-e2e.test.ts name:e2e-codex]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/gemini-e2e.test.ts name:e2e-gemini]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-design.test.ts name:e2e-design]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-plan.test.ts name:e2e-plan]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-bugs.test.ts name:e2e-qa-bugs]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-qa-workflow.test.ts name:e2e-qa-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-review.test.ts name:e2e-review]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-e2e-workflow.test.ts name:e2e-workflow]) (push) Has been cancelled
Periodic Evals / evals (map[file:test/skill-routing-e2e.test.ts name:e2e-routing]) (push) Has been cancelled

Source: https://github.com/garrytan/gstack/commit/026751e
This commit is contained in:
Rocky
2026-05-19 21:18:17 +02:00
commit 834c6db075
797 changed files with 267839 additions and 0 deletions

View File

@@ -0,0 +1,122 @@
import type { ProviderAdapter, RunOpts, RunResult, AvailabilityCheck } from './types';
import { estimateCostUsd } from '../pricing';
import { execFileSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { resolveClaudeCommand } from '../../../browse/src/claude-bin';
/**
 * Claude adapter — wraps the `claude` CLI via `claude -p`.
 *
 * For brevity and to avoid duplicating the full stream-json parser, this adapter
 * uses the claude CLI in non-interactive mode (--print) with the simpler JSON
 * output format. If richer event-level metrics are needed (per-tool timing etc.),
 * swap to session-runner's full stream-json parser.
 *
 * `run()` follows the ProviderAdapter contract of never throwing: an unresolvable
 * binary, a spawn failure, a timeout, an auth problem or a rate limit are all
 * reported through `RunResult.error`.
 */
export class ClaudeAdapter implements ProviderAdapter {
  /** Model id used for reporting and pricing when neither the CLI nor the caller names one. */
  private static readonly DEFAULT_MODEL = 'claude-opus-4-7';

  readonly name = 'claude';
  readonly family = 'claude' as const;

  /**
   * Check that the CLI can be resolved and that some form of auth is present.
   *
   * @returns `{ ok: true }`, or `{ ok: false, reason }` with an install/login hint.
   */
  async available(): Promise<AvailabilityCheck> {
    // Binary on PATH (or GSTACK_CLAUDE_BIN override). Routes through the shared
    // resolver so Windows + override paths behave the same as production sites.
    const resolved = resolveClaudeCommand();
    if (!resolved) {
      return { ok: false, reason: 'claude CLI not found on PATH. Install from https://claude.ai/download or npm i -g @anthropic-ai/claude-code (or set GSTACK_CLAUDE_BIN)' };
    }
    // Auth sniff: ~/.claude/.credentials.json OR ANTHROPIC_API_KEY
    const credsPath = path.join(os.homedir(), '.claude', '.credentials.json');
    const hasCreds = fs.existsSync(credsPath);
    const hasKey = !!process.env.ANTHROPIC_API_KEY;
    if (!hasCreds && !hasKey) {
      return { ok: false, reason: 'No Claude auth found. Log in via `claude` interactive session, or export ANTHROPIC_API_KEY.' };
    }
    return { ok: true };
  }

  /**
   * Run one prompt through `claude -p --output-format json`; the prompt is fed on stdin.
   *
   * @param opts prompt, workdir, timeout and optional model / extra CLI flags.
   * @returns the normalized result; on any failure `error` is set and output/tokens are empty.
   */
  async run(opts: RunOpts): Promise<RunResult> {
    const start = Date.now();
    const resolved = resolveClaudeCommand();
    if (!resolved) {
      // Reported as a result (not thrown) so the harness sees a uniform failure shape.
      return this.emptyResult(
        Date.now() - start,
        { code: 'binary_missing', reason: 'claude CLI not resolvable (set GSTACK_CLAUDE_BIN or install)' },
        opts.model,
      );
    }
    const args = [...resolved.argsPrefix, '-p', '--output-format', 'json'];
    if (opts.model) args.push('--model', opts.model);
    if (opts.extraArgs) args.push(...opts.extraArgs);
    try {
      const out = execFileSync(resolved.command, args, {
        input: opts.prompt,
        cwd: opts.workdir,
        timeout: opts.timeoutMs,
        encoding: 'utf-8',
        maxBuffer: 32 * 1024 * 1024,
      });
      const parsed = this.parseOutput(out);
      return {
        output: parsed.output,
        tokens: parsed.tokens,
        durationMs: Date.now() - start,
        toolCalls: parsed.toolCalls,
        modelUsed: parsed.modelUsed || opts.model || ClaudeAdapter.DEFAULT_MODEL,
      };
    } catch (err: unknown) {
      return this.failureResult(err, Date.now() - start, opts);
    }
  }

  /** Estimate USD cost for `tokens`, pricing against the default model when `model` is omitted. */
  estimateCost(tokens: { input: number; output: number; cached?: number }, model?: string): number {
    return estimateCostUsd(tokens, model ?? ClaudeAdapter.DEFAULT_MODEL);
  }

  /**
   * Turn an error thrown by `execFileSync` into a failed RunResult.
   *
   * Checks run in this order: timeout (SIGTERM / ETIMEDOUT), spawn ENOENT,
   * auth-looking stderr, rate-limit-looking stderr, then `unknown`.
   */
  private failureResult(err: unknown, durationMs: number, opts: RunOpts): RunResult {
    const e = (err ?? {}) as { code?: string; stderr?: Buffer | string; signal?: string; message?: string };
    const stderr = e.stderr?.toString() ?? '';
    if (e.signal === 'SIGTERM' || e.code === 'ETIMEDOUT') {
      return this.emptyResult(durationMs, { code: 'timeout', reason: `exceeded ${opts.timeoutMs}ms` }, opts.model);
    }
    if (e.code === 'ENOENT') {
      // Spawn reports ENOENT both for a missing executable and for a missing cwd;
      // only the former is a binary problem.
      if (opts.workdir && !fs.existsSync(opts.workdir)) {
        return this.emptyResult(durationMs, { code: 'unknown', reason: `workdir does not exist: ${opts.workdir}`.slice(0, 400) }, opts.model);
      }
      return this.emptyResult(durationMs, { code: 'binary_missing', reason: `claude CLI could not be executed (${e.message || 'ENOENT'})`.slice(0, 400) }, opts.model);
    }
    if (/unauthorized|auth|login/i.test(stderr)) {
      return this.emptyResult(durationMs, { code: 'auth', reason: stderr.slice(0, 400) }, opts.model);
    }
    if (/rate[- ]?limit|429/i.test(stderr)) {
      return this.emptyResult(durationMs, { code: 'rate_limit', reason: stderr.slice(0, 400) }, opts.model);
    }
    // `||` rather than `??`: an empty message/stderr must still fall through to 'unknown'.
    return this.emptyResult(durationMs, { code: 'unknown', reason: (e.message || stderr || 'unknown').slice(0, 400) }, opts.model);
  }

  /**
   * Parse claude -p --output-format json output. Shape (as of 2026-04):
   *   { type: "result", result: "<assistant text>", usage: { input_tokens, output_tokens, ... },
   *     num_turns, session_id, ... }
   * Older formats may differ — adapter is best-effort. Anything that is not a
   * JSON object (invalid JSON, or a bare primitive/array such as `42`) is
   * returned verbatim as plain-text output with zeroed metrics.
   */
  private parseOutput(raw: string): { output: string; tokens: { input: number; output: number; cached?: number }; toolCalls: number; modelUsed?: string } {
    type Usage = { input_tokens?: number; output_tokens?: number; cache_read_input_tokens?: number };
    const asPlainText = () => ({ output: raw, tokens: { input: 0, output: 0 }, toolCalls: 0 });

    let decoded: unknown;
    try {
      decoded = JSON.parse(raw);
    } catch {
      // Non-JSON output: treat as plain text.
      return asPlainText();
    }
    if (typeof decoded !== 'object' || decoded === null || Array.isArray(decoded)) {
      return asPlainText();
    }

    const obj = decoded as { result?: unknown; usage?: Usage | null; num_turns?: number; model?: string };
    const usage: Usage = obj.usage ?? {};
    return {
      output: typeof obj.result === 'string' ? obj.result : String(obj.result ?? ''),
      tokens: {
        input: usage.input_tokens ?? 0,
        output: usage.output_tokens ?? 0,
        cached: usage.cache_read_input_tokens,
      },
      // NOTE(review): `num_turns` is reported as the tool-call figure; turns are a
      // proxy rather than an exact tool count — confirm this is intended.
      toolCalls: obj.num_turns ?? 0,
      modelUsed: obj.model,
    };
  }

  /** Build a failed/empty RunResult carrying `error`, attributed to `model` or the default model. */
  private emptyResult(durationMs: number, error: RunResult['error'], model?: string): RunResult {
    return {
      output: '',
      tokens: { input: 0, output: 0 },
      durationMs,
      toolCalls: 0,
      modelUsed: model ?? ClaudeAdapter.DEFAULT_MODEL,
      error,
    };
  }
}

View File

@@ -0,0 +1,125 @@
import type { ProviderAdapter, RunOpts, RunResult, AvailabilityCheck } from './types';
import { estimateCostUsd } from '../pricing';
import { execFileSync, spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
/**
 * Gemini adapter — wraps the `gemini` CLI.
 *
 * Auth is sniffed from ~/.config/gemini/, ~/.gemini/oauth_creds.json, or an API
 * key in the environment (GOOGLE_API_KEY or GEMINI_API_KEY). Output format is
 * NDJSON with `message`/`tool_use`/`result` events when `--output-format
 * stream-json` is requested. This adapter uses a single-response form for
 * simplicity in benchmarks; richer streaming lives in gemini-session-runner.ts.
 *
 * `run()` never throws: spawn failures, timeouts, auth problems and rate limits
 * are reported through `RunResult.error`.
 */
export class GeminiAdapter implements ProviderAdapter {
  /** Model id used for reporting and pricing when neither the CLI nor the caller names one. */
  private static readonly DEFAULT_MODEL = 'gemini-2.5-pro';

  readonly name = 'gemini';
  readonly family = 'gemini' as const;

  /**
   * Check that `gemini` is on PATH and that some form of auth is present.
   *
   * @returns `{ ok: true }`, or `{ ok: false, reason }` with an install/login hint.
   */
  async available(): Promise<AvailabilityCheck> {
    const res = spawnSync('sh', ['-c', 'command -v gemini'], { timeout: 2000 });
    if (res.status !== 0) {
      return { ok: false, reason: 'gemini CLI not found on PATH. Install per https://github.com/google-gemini/gemini-cli' };
    }
    const legacyCfgDir = path.join(os.homedir(), '.config', 'gemini');
    const newCfgDir = path.join(os.homedir(), '.gemini');
    const newOauth = path.join(newCfgDir, 'oauth_creds.json');
    const hasCfg = fs.existsSync(legacyCfgDir) || fs.existsSync(newOauth);
    // Either env var counts as key-based auth.
    const hasKey = !!process.env.GOOGLE_API_KEY || !!process.env.GEMINI_API_KEY;
    if (!hasCfg && !hasKey) {
      return { ok: false, reason: 'No Gemini auth found. Log in via `gemini login` or export GOOGLE_API_KEY (or GEMINI_API_KEY).' };
    }
    return { ok: true };
  }

  /**
   * Run one prompt through `gemini -p ... --output-format stream-json --yolo`.
   *
   * @param opts prompt, workdir, timeout and optional model / extra CLI flags.
   * @returns the normalized result; on any failure `error` is set and output/tokens are empty.
   */
  async run(opts: RunOpts): Promise<RunResult> {
    const start = Date.now();
    // Default to --yolo (non-interactive) and stream-json output so we can parse
    // tokens + tool calls. Callers can override via extraArgs.
    const args = ['-p', opts.prompt, '--output-format', 'stream-json', '--yolo'];
    if (opts.model) args.push('--model', opts.model);
    if (opts.extraArgs) args.push(...opts.extraArgs);
    try {
      const out = execFileSync('gemini', args, {
        cwd: opts.workdir,
        timeout: opts.timeoutMs,
        encoding: 'utf-8',
        maxBuffer: 32 * 1024 * 1024,
      });
      const parsed = this.parseStreamJson(out);
      return {
        output: parsed.output,
        tokens: parsed.tokens,
        durationMs: Date.now() - start,
        toolCalls: parsed.toolCalls,
        modelUsed: parsed.modelUsed || opts.model || GeminiAdapter.DEFAULT_MODEL,
      };
    } catch (err: unknown) {
      return this.failureResult(err, Date.now() - start, opts);
    }
  }

  /** Estimate USD cost for `tokens`, pricing against the default model when `model` is omitted. */
  estimateCost(tokens: { input: number; output: number; cached?: number }, model?: string): number {
    return estimateCostUsd(tokens, model ?? GeminiAdapter.DEFAULT_MODEL);
  }

  /**
   * Turn an error thrown by `execFileSync` into a failed RunResult.
   *
   * Checks run in this order: timeout (SIGTERM / ETIMEDOUT), spawn ENOENT,
   * auth-looking stderr, rate-limit-looking stderr, then `unknown`.
   */
  private failureResult(err: unknown, durationMs: number, opts: RunOpts): RunResult {
    const e = (err ?? {}) as { code?: string; stderr?: Buffer | string; signal?: string; message?: string };
    const stderr = e.stderr?.toString() ?? '';
    if (e.signal === 'SIGTERM' || e.code === 'ETIMEDOUT') {
      return this.emptyResult(durationMs, { code: 'timeout', reason: `exceeded ${opts.timeoutMs}ms` }, opts.model);
    }
    if (e.code === 'ENOENT') {
      // Spawn reports ENOENT both for a missing executable and for a missing cwd;
      // only the former is a binary problem.
      if (opts.workdir && !fs.existsSync(opts.workdir)) {
        return this.emptyResult(durationMs, { code: 'unknown', reason: `workdir does not exist: ${opts.workdir}`.slice(0, 400) }, opts.model);
      }
      return this.emptyResult(durationMs, { code: 'binary_missing', reason: `gemini CLI could not be executed (${e.message || 'ENOENT'})`.slice(0, 400) }, opts.model);
    }
    if (/unauthorized|auth|login|api key/i.test(stderr)) {
      return this.emptyResult(durationMs, { code: 'auth', reason: stderr.slice(0, 400) }, opts.model);
    }
    if (/rate[- ]?limit|429|quota/i.test(stderr)) {
      return this.emptyResult(durationMs, { code: 'rate_limit', reason: stderr.slice(0, 400) }, opts.model);
    }
    // `||` rather than `??`: an empty message/stderr must still fall through to 'unknown'.
    return this.emptyResult(durationMs, { code: 'unknown', reason: (e.message || stderr || 'unknown').slice(0, 400) }, opts.model);
  }

  /**
   * Parse gemini NDJSON stream events:
   *   init                                   → model id if present (session id discarded)
   *   message { delta: true, text }          → concat to output
   *   tool_use { name }                      → increment toolCalls
   *   result { usage: { input_token_count, output_token_count } } → tokens
   *
   * Also accepted, additively: `message.content` (skipped when `role` is
   * "user"), `result.stats`, and `input_tokens` / `output_tokens` counters.
   * NOTE(review): these alternative field names are believed to match what
   * current gemini-cli emits — confirm against the CLI's headless-mode docs.
   * Lines that are not valid JSON are skipped.
   */
  private parseStreamJson(raw: string): { output: string; tokens: { input: number; output: number }; toolCalls: number; modelUsed?: string } {
    let output = '';
    let input = 0;
    let out = 0;
    let toolCalls = 0;
    let modelUsed: string | undefined;
    for (const line of raw.split('\n')) {
      const s = line.trim();
      if (!s) continue;
      try {
        const obj = JSON.parse(s);
        switch (obj.type) {
          case 'init':
            if (typeof obj.model === 'string') modelUsed = obj.model;
            break;
          case 'message':
            if (typeof obj.text === 'string') {
              output += obj.text;
            } else if (typeof obj.content === 'string' && obj.role !== 'user') {
              output += obj.content;
            }
            break;
          case 'tool_use':
            toolCalls += 1;
            break;
          case 'result': {
            const u = obj.usage ?? obj.stats ?? {};
            input += u.input_token_count ?? u.prompt_tokens ?? u.input_tokens ?? 0;
            out += u.output_token_count ?? u.completion_tokens ?? u.output_tokens ?? 0;
            if (obj.model) modelUsed = obj.model;
            break;
          }
          default:
            break;
        }
      } catch {
        // skip malformed lines
      }
    }
    return { output, tokens: { input, output: out }, toolCalls, modelUsed };
  }

  /** Build a failed/empty RunResult carrying `error`, attributed to `model` or the default model. */
  private emptyResult(durationMs: number, error: RunResult['error'], model?: string): RunResult {
    return {
      output: '',
      tokens: { input: 0, output: 0 },
      durationMs,
      toolCalls: 0,
      modelUsed: model ?? GeminiAdapter.DEFAULT_MODEL,
      error,
    };
  }
}

View File

@@ -0,0 +1,127 @@
import type { ProviderAdapter, RunOpts, RunResult, AvailabilityCheck } from './types';
import { estimateCostUsd } from '../pricing';
import { execFileSync, spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
/**
 * GPT adapter — wraps the OpenAI `codex` CLI (codex exec with --json output).
 *
 * Codex uses ~/.codex/ for auth (not OPENAI_API_KEY). The --json flag emits
 * JSONL events; we parse `turn.completed` for usage and `agent_message` / etc.
 * for output aggregation.
 *
 * `run()` never throws: spawn failures, timeouts, auth problems and rate limits
 * are reported through `RunResult.error`.
 */
export class GptAdapter implements ProviderAdapter {
  /** Model id used for reporting and pricing when neither the CLI nor the caller names one. */
  private static readonly DEFAULT_MODEL = 'gpt-5.4';

  readonly name = 'gpt';
  readonly family = 'gpt' as const;

  /**
   * Check that `codex` is on PATH and that ~/.codex/ exists.
   *
   * @returns `{ ok: true }`, or `{ ok: false, reason }` with an install/login hint.
   */
  async available(): Promise<AvailabilityCheck> {
    const res = spawnSync('sh', ['-c', 'command -v codex'], { timeout: 2000 });
    if (res.status !== 0) {
      return { ok: false, reason: 'codex CLI not found on PATH. Install: npm i -g @openai/codex' };
    }
    // Auth sniff: ~/.codex/ should contain auth state after `codex login`
    const codexDir = path.join(os.homedir(), '.codex');
    if (!fs.existsSync(codexDir)) {
      return { ok: false, reason: 'No ~/.codex/ found. Run `codex login` to authenticate via ChatGPT.' };
    }
    return { ok: true };
  }

  /**
   * Run one prompt through `codex exec ... --json` in a read-only sandbox.
   *
   * @param opts prompt, workdir, timeout and optional model / extra CLI flags.
   * @returns the normalized result; on any failure `error` is set and output/tokens are empty.
   */
  async run(opts: RunOpts): Promise<RunResult> {
    const start = Date.now();
    // `-s read-only` is load-bearing safety. With `--skip-git-repo-check` we
    // bypass codex's interactive trust prompt for unknown directories (benchmarks
    // often run in temp dirs / non-git paths), so the read-only sandbox is now
    // the only boundary preventing codex from mutating the workdir. If you ever
    // remove `-s read-only`, drop `--skip-git-repo-check` too.
    const args = ['exec', opts.prompt, '-C', opts.workdir, '-s', 'read-only', '--skip-git-repo-check', '--json'];
    if (opts.model) args.push('-m', opts.model);
    if (opts.extraArgs) args.push(...opts.extraArgs);
    try {
      const out = execFileSync('codex', args, {
        cwd: opts.workdir,
        timeout: opts.timeoutMs,
        encoding: 'utf-8',
        maxBuffer: 32 * 1024 * 1024,
      });
      const parsed = this.parseJsonl(out);
      return {
        output: parsed.output,
        tokens: parsed.tokens,
        durationMs: Date.now() - start,
        toolCalls: parsed.toolCalls,
        modelUsed: parsed.modelUsed || opts.model || GptAdapter.DEFAULT_MODEL,
      };
    } catch (err: unknown) {
      return this.failureResult(err, Date.now() - start, opts);
    }
  }

  /** Estimate USD cost for `tokens`, pricing against the default model when `model` is omitted. */
  estimateCost(tokens: { input: number; output: number; cached?: number }, model?: string): number {
    return estimateCostUsd(tokens, model ?? GptAdapter.DEFAULT_MODEL);
  }

  /**
   * Turn an error thrown by `execFileSync` into a failed RunResult.
   *
   * Checks run in this order: timeout (SIGTERM / ETIMEDOUT), spawn ENOENT,
   * auth-looking stderr, rate-limit-looking stderr, then `unknown`.
   */
  private failureResult(err: unknown, durationMs: number, opts: RunOpts): RunResult {
    const e = (err ?? {}) as { code?: string; stderr?: Buffer | string; signal?: string; message?: string };
    const stderr = e.stderr?.toString() ?? '';
    if (e.signal === 'SIGTERM' || e.code === 'ETIMEDOUT') {
      return this.emptyResult(durationMs, { code: 'timeout', reason: `exceeded ${opts.timeoutMs}ms` }, opts.model);
    }
    if (e.code === 'ENOENT') {
      // Spawn reports ENOENT both for a missing executable and for a missing cwd;
      // only the former is a binary problem.
      if (opts.workdir && !fs.existsSync(opts.workdir)) {
        return this.emptyResult(durationMs, { code: 'unknown', reason: `workdir does not exist: ${opts.workdir}`.slice(0, 400) }, opts.model);
      }
      return this.emptyResult(durationMs, { code: 'binary_missing', reason: `codex CLI could not be executed (${e.message || 'ENOENT'})`.slice(0, 400) }, opts.model);
    }
    if (/unauthorized|auth|login/i.test(stderr)) {
      return this.emptyResult(durationMs, { code: 'auth', reason: stderr.slice(0, 400) }, opts.model);
    }
    if (/rate[- ]?limit|429/i.test(stderr)) {
      return this.emptyResult(durationMs, { code: 'rate_limit', reason: stderr.slice(0, 400) }, opts.model);
    }
    // `||` rather than `??`: an empty message/stderr must still fall through to 'unknown'.
    return this.emptyResult(durationMs, { code: 'unknown', reason: (e.message || stderr || 'unknown').slice(0, 400) }, opts.model);
  }

  /**
   * Parse codex exec --json JSONL stream.
   * Key events:
   *   - item.completed with item.type === 'agent_message' → text output
   *     (multiple messages are joined with a newline)
   *   - item.completed with item.type === 'command_execution' → tool call
   *   - turn.completed → usage.input_tokens, usage.output_tokens (summed across turns)
   *   - thread.started → session id (not used here)
   * Lines that are not valid JSON are skipped.
   */
  private parseJsonl(raw: string): { output: string; tokens: { input: number; output: number }; toolCalls: number; modelUsed?: string } {
    let output = '';
    let input = 0;
    let out = 0;
    let toolCalls = 0;
    let modelUsed: string | undefined;
    for (const line of raw.split('\n')) {
      const s = line.trim();
      if (!s) continue;
      try {
        const obj = JSON.parse(s);
        if (obj.type === 'item.completed' && obj.item) {
          if (obj.item.type === 'agent_message' && typeof obj.item.text === 'string') {
            output += (output ? '\n' : '') + obj.item.text;
          } else if (obj.item.type === 'command_execution') {
            toolCalls += 1;
          }
        } else if (obj.type === 'turn.completed') {
          const u = obj.usage ?? {};
          input += u.input_tokens ?? 0;
          out += u.output_tokens ?? 0;
          if (obj.model) modelUsed = obj.model;
        }
      } catch {
        // skip malformed lines — codex stderr can leak in
      }
    }
    return { output, tokens: { input, output: out }, toolCalls, modelUsed };
  }

  /** Build a failed/empty RunResult carrying `error`, attributed to `model` or the default model. */
  private emptyResult(durationMs: number, error: RunResult['error'], model?: string): RunResult {
    return {
      output: '',
      tokens: { input: 0, output: 0 },
      durationMs,
      toolCalls: 0,
      modelUsed: model ?? GptAdapter.DEFAULT_MODEL,
      error,
    };
  }
}

View File

@@ -0,0 +1,74 @@
/**
 * Provider adapter interface — uniform contract for Claude, GPT, Gemini.
 *
 * Each adapter drives its provider's CLI and normalizes the provider-specific
 * output into the RunResult below. The richer per-provider runners
 * (session-runner.ts, codex-session-runner.ts, gemini-session-runner.ts) are
 * separate; the adapters use simpler single-response parsing. The benchmark
 * harness only talks to adapters through this interface, never to the underlying
 * runners directly.
 */
/** Inputs for a single adapter run (the argument of ProviderAdapter.run). */
export interface RunOpts {
/** The prompt to send to the model. */
prompt: string;
/** Working directory passed to the underlying CLI. */
workdir: string;
/**
 * Hard wall-clock timeout in ms. The field is required, so this interface
 * applies no default itself; 300000 (5 min) is the documented default for
 * callers to pass.
 */
timeoutMs: number;
/** Specific model within the family, optional. Adapters pass through to provider. */
model?: string;
/** Extra flags per-provider (escape hatch for rare cases). Prefer staying generic. */
extraArgs?: string[];
}
/** Token counts in a provider-independent shape; carried in RunResult.tokens and passed to estimateCost. */
export interface TokenUsage {
/** Input (prompt) tokens. */
input: number;
/** Output (completion) tokens. */
output: number;
/** Cached input tokens (Anthropic/OpenAI support). Undefined if provider doesn't report. */
cached?: number;
}
/** Machine-readable failure category; used as the `code` of RunResult.error. */
export type RunError =
| 'auth' // Credentials missing or invalid.
| 'timeout' // Exceeded timeoutMs.
| 'rate_limit' // Provider rate-limited us; backoff exceeded.
| 'binary_missing' // CLI not found on PATH.
| 'unknown'; // Catch-all with reason populated.
/**
 * Normalized outcome of one adapter run. A failed run is signalled by the
 * presence of `error`, not by an exception.
 */
export interface RunResult {
/** Provider's textual output for the prompt. */
output: string;
/** Normalized token usage. 0s if unreported. */
tokens: TokenUsage;
/** Wall-clock duration in milliseconds. */
durationMs: number;
/** Count of tool/function calls made during the run (0 if unsupported). */
toolCalls: number;
/** Actual model ID the provider reports using (may be a variant of the family). */
modelUsed: string;
/** If the run failed, error code + human reason. output/tokens may be partial. */
error?: { code: RunError; reason: string };
}
/** Result of ProviderAdapter.available(). */
export interface AvailabilityCheck {
/** True when the provider can be used. */
ok: boolean;
/** When !ok: short reason shown to user. Includes install / login / env var hint. */
reason?: string;
}
/** Model families an adapter can target; the type of ProviderAdapter.family. */
export type Family = 'claude' | 'gpt' | 'gemini';
/**
 * Contract every provider adapter implements: identity (`name`, `family`), an
 * availability probe, a prompt runner, and a cost estimator. Both `available()`
 * and `run()` are specified as non-throwing; failures are returned as data.
 */
export interface ProviderAdapter {
/** Stable name used in output tables and config (e.g., 'claude', 'gpt', 'gemini'). */
readonly name: string;
/** Model family this adapter targets. */
readonly family: Family;
/**
* Check whether the provider's CLI binary is present and authenticated.
* Should never block >2s. Non-throwing: returns { ok: false, reason } on failure.
*/
available(): Promise<AvailabilityCheck>;
/** Run a prompt and return normalized RunResult. Non-throwing. Errors go in result.error. */
run(opts: RunOpts): Promise<RunResult>;
/** Estimate USD cost for the reported token usage and model. */
estimateCost(tokens: TokenUsage, model?: string): number;
}