Initial import from garrytan/gstack@026751e (main snapshot via local relay)

Source: https://github.com/garrytan/gstack/commit/026751e
2026-05-19 21:18:17 +02:00
commit 834c6db075
797 changed files with 267839 additions and 0 deletions
--- a/test/helpers/providers/claude.ts
+++ b/test/helpers/providers/claude.ts
@@ -0,0 +1,122 @@
+import type { ProviderAdapter, RunOpts, RunResult, AvailabilityCheck } from './types';
+import { estimateCostUsd } from '../pricing';
+import { execFileSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { resolveClaudeCommand } from '../../../browse/src/claude-bin';
+
+/**
+ * Claude adapter — wraps the `claude` CLI via `claude -p`.
+ *
+ * For brevity and to avoid duplicating the full stream-json parser, this adapter
+ * uses the claude CLI in non-interactive mode (--print) with the simpler JSON output
+ * format. If richer event-level metrics are needed (per-tool timing etc.),
+ * swap to session-runner's full stream-json parser.
+ */
+export class ClaudeAdapter implements ProviderAdapter {
+  readonly name = 'claude';
+  readonly family = 'claude' as const;
+  /** Model label used when neither the CLI output nor the caller names one. */
+  private static readonly DEFAULT_MODEL = 'claude-opus-4-7';
+
+  /**
+   * Reports whether the CLI resolves and a credential source exists.
+   * Only sniffs for the presence of credentials; it does not validate them.
+   */
+  async available(): Promise<AvailabilityCheck> {
+    // Binary on PATH (or GSTACK_CLAUDE_BIN override). Routes through the shared
+    // resolver so Windows + override paths behave the same as production sites.
+    if (!resolveClaudeCommand()) {
+      return { ok: false, reason: 'claude CLI not found on PATH. Install from https://claude.ai/download or npm i -g @anthropic-ai/claude-code (or set GSTACK_CLAUDE_BIN)' };
+    }
+    // Auth sniff: ~/.claude/.credentials.json OR ANTHROPIC_API_KEY
+    const hasCreds = fs.existsSync(path.join(os.homedir(), '.claude', '.credentials.json'));
+    if (!hasCreds && !process.env.ANTHROPIC_API_KEY) {
+      return { ok: false, reason: 'No Claude auth found. Log in via `claude` interactive session, or export ANTHROPIC_API_KEY.' };
+    }
+    return { ok: true };
+  }
+
+  /**
+   * Pipes the prompt to `claude -p --output-format json` on stdin and normalizes the reply.
+   * Never throws for CLI problems: an unresolvable or missing binary, timeout, auth failure,
+   * rate limit, or any other child-process error is reported through `error` instead.
+   */
+  async run(opts: RunOpts): Promise<RunResult> {
+    const start = Date.now();
+    const resolved = resolveClaudeCommand();
+    if (!resolved) {
+      return this.emptyResult(Date.now() - start, { code: 'binary_missing', reason: 'claude CLI not resolvable (set GSTACK_CLAUDE_BIN or install)' }, opts.model);
+    }
+    const args = [...resolved.argsPrefix, '-p', '--output-format', 'json'];
+    if (opts.model) args.push('--model', opts.model);
+    if (opts.extraArgs) args.push(...opts.extraArgs);
+
+    try {
+      const out = execFileSync(resolved.command, args, {
+        input: opts.prompt,
+        cwd: opts.workdir,
+        timeout: opts.timeoutMs,
+        encoding: 'utf-8',
+        maxBuffer: 32 * 1024 * 1024,
+      });
+      const { output, tokens, toolCalls, modelUsed } = this.parseOutput(out);
+      return { output, tokens, durationMs: Date.now() - start, toolCalls, modelUsed: modelUsed || opts.model || ClaudeAdapter.DEFAULT_MODEL };
+    } catch (err: unknown) {
+      const durationMs = Date.now() - start;
+      const e = err as { code?: string; stderr?: Buffer | string; signal?: string; message?: string };
+      const stderr = e.stderr?.toString() ?? '';
+      // Spawn ENOENT: the resolved command does not exist (Node reports the same code for a nonexistent cwd).
+      if (e.code === 'ENOENT') {
+        return this.emptyResult(durationMs, { code: 'binary_missing', reason: (e.message || 'claude CLI not found').slice(0, 400) }, opts.model);
+      }
+      if (e.signal === 'SIGTERM' || e.code === 'ETIMEDOUT') {
+        return this.emptyResult(durationMs, { code: 'timeout', reason: `exceeded ${opts.timeoutMs}ms` }, opts.model);
+      }
+      if (/unauthorized|auth|login/i.test(stderr)) {
+        return this.emptyResult(durationMs, { code: 'auth', reason: stderr.slice(0, 400) }, opts.model);
+      }
+      if (/rate[- ]?limit|429/i.test(stderr)) {
+        return this.emptyResult(durationMs, { code: 'rate_limit', reason: stderr.slice(0, 400) }, opts.model);
+      }
+      // `||` rather than `??`: an empty message or empty stderr must fall through to the next candidate.
+      return this.emptyResult(durationMs, { code: 'unknown', reason: (e.message || stderr || 'unknown').slice(0, 400) }, opts.model);
+    }
+  }
+
+  /** Estimates USD cost via the shared pricing helper, using the default model when none is given. */
+  estimateCost(tokens: { input: number; output: number; cached?: number }, model?: string): number {
+    return estimateCostUsd(tokens, model ?? ClaudeAdapter.DEFAULT_MODEL);
+  }
+
+  /**
+   * Parse claude -p --output-format json output. Shape (as of 2026-04):
+   *   { type: "result", result: "<assistant text>", usage: { input_tokens, output_tokens, ... },
+   *     num_turns, session_id, ... }
+   * Older formats may differ — adapter is best-effort. Anything that is not a JSON object
+   * (plain text, or a bare JSON scalar/array) is returned verbatim with zeroed metrics.
+   */
+  private parseOutput(raw: string): { output: string; tokens: { input: number; output: number; cached?: number }; toolCalls: number; modelUsed?: string } {
+    const plainText = { output: raw, tokens: { input: 0, output: 0 }, toolCalls: 0 };
+    try {
+      const obj = JSON.parse(raw);
+      if (obj === null || typeof obj !== 'object' || Array.isArray(obj)) return plainText;
+      const u = obj.usage ?? {};
+      return {
+        output: typeof obj.result === 'string' ? obj.result : String(obj.result ?? ''),
+        tokens: { input: u.input_tokens ?? 0, output: u.output_tokens ?? 0, cached: u.cache_read_input_tokens },
+        toolCalls: obj.num_turns ?? 0,
+        modelUsed: obj.model,
+      };
+    } catch {
+      // Non-JSON output: treat as plain text.
+      return plainText;
+    }
+  }
+
+  /** Builds the RunResult for a failed run: empty output, zeroed usage, and the given error. */
+  private emptyResult(durationMs: number, error: RunResult['error'], model?: string): RunResult {
+    return { output: '', tokens: { input: 0, output: 0 }, durationMs, toolCalls: 0, modelUsed: model ?? ClaudeAdapter.DEFAULT_MODEL, error };
+  }
+}
--- a/test/helpers/providers/gemini.ts
+++ b/test/helpers/providers/gemini.ts
@@ -0,0 +1,125 @@
+import type { ProviderAdapter, RunOpts, RunResult, AvailabilityCheck } from './types';
+import { estimateCostUsd } from '../pricing';
+import { execFileSync, spawnSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+/**
+ * Gemini adapter — wraps the `gemini` CLI.
+ *
+ * Auth is sniffed from ~/.config/gemini/, ~/.gemini/oauth_creds.json, or an API-key env
+ * var (GEMINI_API_KEY / GOOGLE_API_KEY). Output format is NDJSON with `message`/`tool_use`/
+ * `result` events when `--output-format stream-json` is requested. This adapter collects
+ * the whole stream and parses it after exit; richer streaming lives in gemini-session-runner.ts.
+ */
+export class GeminiAdapter implements ProviderAdapter {
+  readonly name = 'gemini';
+  readonly family = 'gemini' as const;
+  /** Model label used when neither the CLI output nor the caller names one. */
+  private static readonly DEFAULT_MODEL = 'gemini-2.5-pro';
+
+  /** Reports whether `gemini` is on PATH and a credential source exists (presence only, not validity). */
+  async available(): Promise<AvailabilityCheck> {
+    const res = spawnSync('sh', ['-c', 'command -v gemini'], { timeout: 2000 });
+    if (res.status !== 0) {
+      return { ok: false, reason: 'gemini CLI not found on PATH. Install per https://github.com/google-gemini/gemini-cli' };
+    }
+    const legacyCfgDir = path.join(os.homedir(), '.config', 'gemini');
+    const newOauth = path.join(os.homedir(), '.gemini', 'oauth_creds.json');
+    const hasCfg = fs.existsSync(legacyCfgDir) || fs.existsSync(newOauth);
+    // GEMINI_API_KEY is the CLI's API-key variable; GOOGLE_API_KEY stays accepted for existing setups.
+    const hasKey = !!process.env.GEMINI_API_KEY || !!process.env.GOOGLE_API_KEY;
+    if (!hasCfg && !hasKey) {
+      return { ok: false, reason: 'No Gemini auth found. Log in via `gemini login` or export GEMINI_API_KEY (or GOOGLE_API_KEY).' };
+    }
+    return { ok: true };
+  }
+
+  /**
+   * Runs `gemini -p <prompt>` and normalizes the collected stream-json events. Child-process
+   * failures are classified (missing binary, timeout, auth, rate limit, other) into `error`, not thrown.
+   */
+  async run(opts: RunOpts): Promise<RunResult> {
+    const start = Date.now();
+    // Default to --yolo (non-interactive) and stream-json output so we can parse
+    // tokens + tool calls. Callers can override via extraArgs.
+    const args = ['-p', opts.prompt, '--output-format', 'stream-json', '--yolo'];
+    if (opts.model) args.push('--model', opts.model);
+    if (opts.extraArgs) args.push(...opts.extraArgs);
+
+    try {
+      const out = execFileSync('gemini', args, {
+        cwd: opts.workdir,
+        timeout: opts.timeoutMs,
+        encoding: 'utf-8',
+        maxBuffer: 32 * 1024 * 1024,
+      });
+      const { output, tokens, toolCalls, modelUsed } = this.parseStreamJson(out);
+      return { output, tokens, durationMs: Date.now() - start, toolCalls, modelUsed: modelUsed || opts.model || GeminiAdapter.DEFAULT_MODEL };
+    } catch (err: unknown) {
+      const durationMs = Date.now() - start;
+      const e = err as { code?: string; stderr?: Buffer | string; signal?: string; message?: string };
+      const stderr = e.stderr?.toString() ?? '';
+      // Spawn ENOENT: `gemini` is not on PATH (Node reports the same code for a nonexistent cwd).
+      if (e.code === 'ENOENT') {
+        return this.emptyResult(durationMs, { code: 'binary_missing', reason: (e.message || 'gemini CLI not found on PATH').slice(0, 400) }, opts.model);
+      }
+      if (e.signal === 'SIGTERM' || e.code === 'ETIMEDOUT') {
+        return this.emptyResult(durationMs, { code: 'timeout', reason: `exceeded ${opts.timeoutMs}ms` }, opts.model);
+      }
+      if (/unauthorized|auth|login|api key/i.test(stderr)) {
+        return this.emptyResult(durationMs, { code: 'auth', reason: stderr.slice(0, 400) }, opts.model);
+      }
+      if (/rate[- ]?limit|429|quota/i.test(stderr)) {
+        return this.emptyResult(durationMs, { code: 'rate_limit', reason: stderr.slice(0, 400) }, opts.model);
+      }
+      return this.emptyResult(durationMs, { code: 'unknown', reason: (e.message || stderr || 'unknown').slice(0, 400) }, opts.model);
+    }
+  }
+
+  /** Estimates USD cost via the shared pricing helper, using the default model when none is given. */
+  estimateCost(tokens: { input: number; output: number; cached?: number }, model?: string): number {
+    return estimateCostUsd(tokens, model ?? GeminiAdapter.DEFAULT_MODEL);
+  }
+
+  /**
+   * Parse gemini NDJSON stream events:
+   *   init  → session id (discarded here)
+   *   message { delta: true, text } → concat to output
+   *   tool_use { name } → increment toolCalls
+   *   result { usage: { input_token_count, output_token_count } } → tokens
+   */
+  private parseStreamJson(raw: string): { output: string; tokens: { input: number; output: number }; toolCalls: number; modelUsed?: string } {
+    let output = '';
+    let input = 0;
+    let out = 0;
+    let toolCalls = 0;
+    let modelUsed: string | undefined;
+    for (const line of raw.split('\n')) {
+      const s = line.trim();
+      if (!s) continue;
+      try {
+        const obj = JSON.parse(s);
+        if (obj.type === 'message' && typeof obj.text === 'string') {
+          output += obj.text;
+        } else if (obj.type === 'tool_use') {
+          toolCalls += 1;
+        } else if (obj.type === 'result') {
+          const u = obj.usage ?? {};
+          input += u.input_token_count ?? u.prompt_tokens ?? 0;
+          out += u.output_token_count ?? u.completion_tokens ?? 0;
+          if (obj.model) modelUsed = obj.model;
+        }
+      } catch {
+        // skip malformed lines
+      }
+    }
+    return { output, tokens: { input, output: out }, toolCalls, modelUsed };
+  }
+
+  /** Builds the RunResult for a failed run: empty output, zeroed usage, and the given error. */
+  private emptyResult(durationMs: number, error: RunResult['error'], model?: string): RunResult {
+    return { output: '', tokens: { input: 0, output: 0 }, durationMs, toolCalls: 0, modelUsed: model ?? GeminiAdapter.DEFAULT_MODEL, error };
+  }
+}
--- a/test/helpers/providers/gpt.ts
+++ b/test/helpers/providers/gpt.ts
@@ -0,0 +1,127 @@
+import type { ProviderAdapter, RunOpts, RunResult, AvailabilityCheck } from './types';
+import { estimateCostUsd } from '../pricing';
+import { execFileSync, spawnSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+/**
+ * GPT adapter — wraps the OpenAI `codex` CLI (codex exec with --json output).
+ *
+ * Codex uses ~/.codex/ for auth (not OPENAI_API_KEY). The --json flag emits
+ * JSONL events; we parse `turn.completed` for usage and `item.completed` items
+ * (`agent_message`, `command_execution`) for output aggregation and tool-call counts.
+ */
+export class GptAdapter implements ProviderAdapter {
+  readonly name = 'gpt';
+  readonly family = 'gpt' as const;
+  /** Model label used when neither the CLI output nor the caller names one. */
+  private static readonly DEFAULT_MODEL = 'gpt-5.4';
+
+  /** Reports whether `codex` is on PATH and ~/.codex/ exists (presence only, not validity). */
+  async available(): Promise<AvailabilityCheck> {
+    const res = spawnSync('sh', ['-c', 'command -v codex'], { timeout: 2000 });
+    if (res.status !== 0) {
+      return { ok: false, reason: 'codex CLI not found on PATH. Install: npm i -g @openai/codex' };
+    }
+    // Auth sniff: ~/.codex/ should contain auth state after `codex login`
+    const codexDir = path.join(os.homedir(), '.codex');
+    if (!fs.existsSync(codexDir)) {
+      return { ok: false, reason: 'No ~/.codex/ found. Run `codex login` to authenticate via ChatGPT.' };
+    }
+    return { ok: true };
+  }
+
+  /**
+   * Runs `codex exec <prompt> --json` in a read-only sandbox and normalizes the JSONL events. Child-process
+   * failures are classified (missing binary, timeout, auth, rate limit, other) into `error`, not thrown.
+   */
+  async run(opts: RunOpts): Promise<RunResult> {
+    const start = Date.now();
+    // `-s read-only` is load-bearing safety. With `--skip-git-repo-check` we
+    // bypass codex's interactive trust prompt for unknown directories (benchmarks
+    // often run in temp dirs / non-git paths), so the read-only sandbox is now
+    // the only boundary preventing codex from mutating the workdir. If you ever
+    // remove `-s read-only`, drop `--skip-git-repo-check` too.
+    const args = ['exec', opts.prompt, '-C', opts.workdir, '-s', 'read-only', '--skip-git-repo-check', '--json'];
+    if (opts.model) args.push('-m', opts.model);
+    if (opts.extraArgs) args.push(...opts.extraArgs);
+
+    try {
+      const out = execFileSync('codex', args, {
+        cwd: opts.workdir,
+        timeout: opts.timeoutMs,
+        encoding: 'utf-8',
+        maxBuffer: 32 * 1024 * 1024,
+      });
+      const { output, tokens, toolCalls, modelUsed } = this.parseJsonl(out);
+      return { output, tokens, durationMs: Date.now() - start, toolCalls, modelUsed: modelUsed || opts.model || GptAdapter.DEFAULT_MODEL };
+    } catch (err: unknown) {
+      const durationMs = Date.now() - start;
+      const e = err as { code?: string; stderr?: Buffer | string; signal?: string; message?: string };
+      const stderr = e.stderr?.toString() ?? '';
+      // Spawn ENOENT: `codex` is not on PATH (Node reports the same code for a nonexistent cwd).
+      if (e.code === 'ENOENT') {
+        return this.emptyResult(durationMs, { code: 'binary_missing', reason: (e.message || 'codex CLI not found on PATH').slice(0, 400) }, opts.model);
+      }
+      if (e.signal === 'SIGTERM' || e.code === 'ETIMEDOUT') {
+        return this.emptyResult(durationMs, { code: 'timeout', reason: `exceeded ${opts.timeoutMs}ms` }, opts.model);
+      }
+      if (/unauthorized|auth|login/i.test(stderr)) {
+        return this.emptyResult(durationMs, { code: 'auth', reason: stderr.slice(0, 400) }, opts.model);
+      }
+      if (/rate[- ]?limit|429/i.test(stderr)) {
+        return this.emptyResult(durationMs, { code: 'rate_limit', reason: stderr.slice(0, 400) }, opts.model);
+      }
+      return this.emptyResult(durationMs, { code: 'unknown', reason: (e.message || stderr || 'unknown').slice(0, 400) }, opts.model);
+    }
+  }
+
+  /** Estimates USD cost via the shared pricing helper, using the default model when none is given. */
+  estimateCost(tokens: { input: number; output: number; cached?: number }, model?: string): number {
+    return estimateCostUsd(tokens, model ?? GptAdapter.DEFAULT_MODEL);
+  }
+
+  /**
+   * Parse codex exec --json JSONL stream.
+   * Key events:
+   *   - item.completed with item.type === 'agent_message' → text output
+   *   - item.completed with item.type === 'command_execution' → tool call
+   *   - turn.completed → usage.input_tokens, usage.output_tokens
+   *   - thread.started → session id (not used here)
+   */
+  private parseJsonl(raw: string): { output: string; tokens: { input: number; output: number }; toolCalls: number; modelUsed?: string } {
+    let output = '';
+    let input = 0;
+    let out = 0;
+    let toolCalls = 0;
+    let modelUsed: string | undefined;
+    for (const line of raw.split('\n')) {
+      const s = line.trim();
+      if (!s) continue;
+      try {
+        const obj = JSON.parse(s);
+        if (obj.type === 'item.completed' && obj.item) {
+          if (obj.item.type === 'agent_message' && typeof obj.item.text === 'string') {
+            output += (output ? '\n' : '') + obj.item.text;
+          } else if (obj.item.type === 'command_execution') {
+            toolCalls += 1;
+          }
+        } else if (obj.type === 'turn.completed') {
+          const u = obj.usage ?? {};
+          input += u.input_tokens ?? 0;
+          out += u.output_tokens ?? 0;
+          if (obj.model) modelUsed = obj.model;
+        }
+      } catch {
+        // skip malformed lines — codex stderr can leak in
+      }
+    }
+    return { output, tokens: { input, output: out }, toolCalls, modelUsed };
+  }
+
+  /** Builds the RunResult for a failed run: empty output, zeroed usage, and the given error. */
+  private emptyResult(durationMs: number, error: RunResult['error'], model?: string): RunResult {
+    return { output: '', tokens: { input: 0, output: 0 }, durationMs, toolCalls: 0, modelUsed: model ?? GptAdapter.DEFAULT_MODEL, error };
+  }
+}
--- a/test/helpers/providers/types.ts
+++ b/test/helpers/providers/types.ts
@@ -0,0 +1,74 @@
+/**
+ * Provider adapter interface — uniform contract for Claude, GPT, Gemini.
+ *
+ * Each adapter shells out to its provider's CLI and normalizes the per-provider output
+ * into the RunResult below. The richer session runners (session-runner.ts,
+ * codex-session-runner.ts, gemini-session-runner.ts) are separate and not wrapped here.
+ * The benchmark harness only talks to adapters through this interface.
+ */
+
+export interface RunOpts {
+  /** The prompt to send to the model. */
+  prompt: string;
+  /** Working directory passed to the underlying CLI. */
+  workdir: string;
+  /** Hard wall-clock timeout in ms, e.g. 300000 (5 min). Required: no default is applied here. */
+  timeoutMs: number;
+  /** Specific model within the family, optional. Adapters pass through to provider. */
+  model?: string;
+  /** Extra flags per-provider (escape hatch for rare cases). Prefer staying generic. */
+  extraArgs?: string[];
+}
+
+export interface TokenUsage {
+  input: number; // Input (prompt) tokens.
+  output: number; // Output (completion) tokens.
+  /** Cached input tokens (Anthropic/OpenAI support). Undefined if provider doesn't report. */
+  cached?: number;
+}
+
+export type RunError =
+  | 'auth'           // Credentials missing or invalid.
+  | 'timeout'        // Exceeded timeoutMs.
+  | 'rate_limit'     // Provider rate-limited us; backoff exceeded.
+  | 'binary_missing' // CLI not found on PATH.
+  | 'unknown';       // Catch-all; see the accompanying `reason` for the underlying message.
+
+export interface RunResult {
+  /** Provider's textual output for the prompt. */
+  output: string;
+  /** Normalized token usage. 0s if unreported. */
+  tokens: TokenUsage;
+  /** Wall-clock duration of the run, in milliseconds. */
+  durationMs: number;
+  /** Count of tool/function calls made during the run (0 if unsupported). */
+  toolCalls: number;
+  /** Model ID the provider reports using; adapters may fall back to the requested model or a default. */
+  modelUsed: string;
+  /** If the run failed, error code + human reason. output/tokens may be partial. */
+  error?: { code: RunError; reason: string };
+}
+
+export interface AvailabilityCheck {
+  ok: boolean; // True when the provider CLI looks usable (binary found, credentials present).
+  /** When !ok: short reason shown to user. Includes install / login / env var hint. */
+  reason?: string;
+}
+
+export type Family = 'claude' | 'gpt' | 'gemini'; // Model families an adapter can target (see ProviderAdapter.family).
+
+export interface ProviderAdapter {
+  /** Stable name used in output tables and config (e.g., 'claude', 'gpt', 'gemini'). */
+  readonly name: string;
+  /** Model family this adapter targets. */
+  readonly family: Family;
+  /**
+   * Check whether the provider's CLI binary is present and authenticated.
+   * Should never block >2s. Non-throwing: returns { ok: false, reason } on failure.
+   */
+  available(): Promise<AvailabilityCheck>;
+  /** Run a prompt and return a normalized RunResult. Must not throw: failures go in result.error. */
+  run(opts: RunOpts): Promise<RunResult>;
+  /** Estimate USD cost for the reported token usage; `model` is optional and adapters default it. */
+  estimateCost(tokens: TokenUsage, model?: string): number;
+}