Initial import from garrytan/gstack@026751e (main snapshot via local relay)

Source: https://github.com/garrytan/gstack/commit/026751e
2026-05-19 21:18:17 +02:00
commit 834c6db075
797 changed files with 267839 additions and 0 deletions
--- a/test/skill-e2e-plan-ceo-mode-routing.test.ts
+++ b/test/skill-e2e-plan-ceo-mode-routing.test.ts
@@ -0,0 +1,212 @@
+/**
+ * /plan-ceo-review mode-routing E2E (periodic, paid, real-PTY).
+ *
+ * Asserts: when /plan-ceo-review reaches its Step 0F mode-selection
+ * AskUserQuestion and the user picks HOLD SCOPE or SCOPE EXPANSION,
+ * the downstream rendered output reflects that mode's distinctive
+ * posture language.
+ *
+ * Why this exists: existing tests verify that the question fires. Nothing
+ * verifies the answer actually routes. A regression where Step 0F shows
+ * the question but the agent ignores the choice (e.g. always defaults
+ * to EXPANSION) would not be caught by any prior test.
+ *
+ * Tier: periodic (not gate). Each run navigates 8-12 prior AskUserQuestions (telemetry,
+ * proactive, routing, vendoring, brain, office-hours, premise×3, approach)
+ * before reaching Step 0F. At ~30s per AskUserQuestion that's a 4-6 min navigation
+ * phase per case. The full 2-case suite runs ~12-15 min, $3-4. Too slow
+ * for gate-tier; weekly is fine.
+ *
+ * Mode coverage: HOLD SCOPE + SCOPE EXPANSION cover the two posture poles
+ * (rigor vs ambition). SELECTIVE EXPANSION and SCOPE REDUCTION are V2 once
+ * the navigation phase is shorter or has a deterministic fast-path through
+ * Step 0A/0C-bis.
+ *
+ * Posture assertions: each mode has distinct downstream language. The
+ * checks below are deliberately permissive — they catch the binary
+ * "did the mode posture even apply" question, not Opus-specific phrasing.
+ *
+ *   HOLD SCOPE        — "rigor", "bulletproof", "hold scope", or "maximum rigor"
+ *   SCOPE EXPANSION   — "expansion", "10x", "delight", "dream", "cathedral", or "opt-in"
+ */
+
+import { describe, test } from 'bun:test';
+import {
+  launchClaudePty,
+  isNumberedOptionListVisible,
+  isPermissionDialogVisible,
+  parseNumberedOptions,
+  isPlanReadyVisible,
+  MODE_RE,
+  optionsSignature,
+  TAIL_SCAN_BYTES,
+  type ClaudePtySession,
+} from './helpers/claude-pty-runner';
+
+const shouldRun = process.env.EVALS_TIER === 'periodic' && Boolean(process.env.EVALS);
+const describeE2E = !shouldRun ? describe.skip : describe; // paid suite: skipped unless opted in
+
+interface ModeCase {
+  mode: 'HOLD SCOPE' | 'SCOPE EXPANSION'; // searched for as a substring of the upper-cased option labels
+  /** Tested against the text visible since the mode pick; a match on any one alternative passes. */
+  postureRe: RegExp;
+}
+
+const CASES: ModeCase[] = [
+  { postureRe: /\b(rigor|bulletproof|hold\s*scope|maximum\s+rigor)\b/i, mode: 'HOLD SCOPE' },
+  { postureRe: /\b(expansion|10x|delight|dream|cathedral|opt[\s-]?in)\b/i, mode: 'SCOPE EXPANSION' },
+];
+
+/**
+ * Drive the PTY through the AskUserQuestions that precede Step 0F, answering
+ * each newly rendered one with option 1, until a list whose labels match
+ * MODE_RE appears. Permission dialogs are granted but not counted.
+ *
+ * Returns the index of the option containing `targetMode` plus the visible
+ * text at detection time; the mode option itself is NOT selected here. Throws
+ * if claude exits, `targetMode` is absent, or `maxNav` / `budgetMs` run out.
+ */
+async function navigateToModeAskUserQuestion(
+  session: ClaudePtySession,
+  since: number,
+  targetMode: ModeCase['mode'],
+  opts: { maxNav?: number; budgetMs?: number } = {},
+): Promise<{ modeIndex: number; visibleAtMode: string }> {
+  // /plan-ceo-review's mode AskUserQuestion (Step 0F) sits behind several preamble
+  // and Step 0A-0C-bis gates: telemetry, proactive, routing, vendoring,
+  // brain privacy, office-hours offer, premise challenge (3 questions),
+  // approach selection. 12 hops is the conservative ceiling.
+  const maxNav = opts.maxNav ?? 12;
+  const budgetMs = opts.budgetMs ?? 420_000;
+  const start = Date.now();
+  let priorAnswered = 0;
+  let lastSig = optionsSignature([]);
+
+  while (Date.now() - start < budgetMs) {
+    if (session.exited()) {
+      throw new Error(
+        `claude exited (code=${session.exitCode()}) during nav.\n` +
+        `Last visible:\n${session.visibleSince(since).slice(-2000)}`,
+      );
+    }
+    await Bun.sleep(2000);
+    const visible = session.visibleSince(since);
+    if (!isNumberedOptionListVisible(visible)) continue;
+    const options = parseNumberedOptions(visible);
+    if (options.length < 2) continue;
+
+    // Has the rendered list changed since last poll? If not, we're seeing
+    // the same prompt and shouldn't double-press.
+    const sig = optionsSignature(options);
+    if (sig === lastSig) continue;
+    lastSig = sig;
+
+    // Is THIS the mode AskUserQuestion?
+    if (options.some(o => MODE_RE.test(o.label))) {
+      const target = options.find(o => o.label.toUpperCase().includes(targetMode));
+      if (!target) {
+        throw new Error(
+          `Mode AskUserQuestion rendered but target "${targetMode}" not in option labels:\n` +
+          options.map(o => `  ${o.index}. ${o.label}`).join('\n'),
+        );
+      }
+      return { modeIndex: target.index, visibleAtMode: visible };
+    }
+
+    // Permission dialog? Grant with "1" but don't count it against nav budget.
+    // Classify on the recent tail only — old permission text persists in
+    // visibleSince and would re-trigger forever.
+    //
+    // Note: runPlanSkillObservation has its own permission-dialog filter that
+    // simply skips classification (since it observes, doesn't drive). This nav
+    // loop drives the PTY directly via launchClaudePty and so owns its own
+    // dialog handling — granting with "1" so the workflow advances. Both
+    // paths share TAIL_SCAN_BYTES as the recent-tail window so tuning stays
+    // in sync.
+    if (isPermissionDialogVisible(visible.slice(-TAIL_SCAN_BYTES))) {
+      session.send('1\r');
+      await Bun.sleep(1500);
+      continue;
+    }
+
+    // Not the mode AskUserQuestion — answer with option 1 (recommended) and continue.
+    if (priorAnswered >= maxNav) {
+      throw new Error(
+        `Navigated ${maxNav} prior AskUserQuestions without reaching the mode AskUserQuestion. ` +
+        `Last list:\n${options.map(o => `  ${o.index}. ${o.label}`).join('\n')}`,
+      );
+    }
+    priorAnswered++;
+    session.send('1\r');
+    // Give the agent a beat to advance before re-polling.
+    await Bun.sleep(2000);
+  }
+  throw new Error(`Mode AskUserQuestion not reached within ${budgetMs}ms`);
+}
+
+describeE2E('/plan-ceo-review mode routing (periodic)', () => {
+  // Per-test timeout handed to test(). Nav (420s default) + downstream (240s)
+  // can exceed it, so the downstream wait below is clamped to stay inside it.
+  const testTimeoutMs = 600_000;
+  for (const c of CASES) {
+    test(
+      `mode "${c.mode}" routes to its distinctive posture`,
+      async () => {
+        const testStart = Date.now();
+        const session = await launchClaudePty({
+          permissionMode: 'plan',
+          timeoutMs: 540_000,
+        });
+        try {
+          // Startup pause before marking the buffer (presumably lets the TUI boot).
+          await Bun.sleep(8000);
+          const since = session.mark();
+          session.send('/plan-ceo-review\r');
+
+          const { modeIndex } = await navigateToModeAskUserQuestion(session, since, c.mode);
+
+          // Record the raw-output offset at mode-pick time, then send the index.
+          const sincePick = session.rawOutput().length;
+          session.send(`${modeIndex}\r`);
+
+          // Poll for posture-distinctive text. Stop at whichever comes first:
+          // the 240s downstream budget, or 10s before the test timeout, so the
+          // failure below can still report the snapshot instead of a bare timeout.
+          const deadline = Math.min(Date.now() + 240_000, testStart + testTimeoutMs - 10_000);
+          let postureMatched = false;
+          let planReadySeen = false;
+          let downstreamSnapshot = '';
+          while (Date.now() < deadline) {
+            await Bun.sleep(2500);
+            if (session.exited()) {
+              throw new Error(
+                `claude exited (code=${session.exitCode()}) after mode pick.\n` +
+                `Downstream:\n${session.visibleSince(sincePick).slice(-2000)}`,
+              );
+            }
+            downstreamSnapshot = session.visibleSince(sincePick);
+            if (c.postureRe.test(downstreamSnapshot)) {
+              postureMatched = true;
+              break;
+            }
+            // Plan-ready alone is not a stop signal — posture text may still
+            // arrive while the agent finishes writing the plan. Remember it
+            // only so the failure message can say how far the run got.
+            if (isPlanReadyVisible(downstreamSnapshot)) planReadySeen = true;
+          }
+          if (!postureMatched) {
+            throw new Error(
+              `Mode "${c.mode}" routing FAILED: no posture match for ${c.postureRe.source}.\n` +
+              `plan-ready seen: ${planReadySeen ? 'yes' : 'no'}\n` +
+              `--- downstream visible since mode pick (last 3KB) ---\n` +
+              downstreamSnapshot.slice(-3000),
+            );
+          }
+        } finally {
+          await session.close();
+        }
+      },
+      testTimeoutMs,
+    );
+  }
+});