refactor(e2e): extract infrastructure into tests/e2e/infra/ module

- Move fixtures.ts, llm-judge.ts, results-db.ts to tests/e2e/infra/
- Deduplicate wdio.conf.js: use createTestFixture() instead of inline copy
- Replace __dirname paths with projectRoot-anchored paths
- Create test-mode-constants.ts (typed env var names, flag registry)
- Create scripts/preflight-check.sh (validates tauri-driver, display, Claude CLI)
- Create scripts/check-test-flags.sh (CI lint for AGOR_TEST flag drift)
- Rewrite tests/e2e/README.md with full documentation
- Update spec imports for moved infra files
2026-03-18 03:06:57 +01:00 · 2026-03-18 03:06:57 +01:00 · e76bc341f2
commit e76bc341f2
parent 538a31f85c
10 changed files with 235 additions and 191 deletions
--- a/tests/e2e/infra/fixtures.ts
+++ b/tests/e2e/infra/fixtures.ts
@ -0,0 +1,142 @@
+// Test fixture generator — creates isolated test environments
+// Used by E2E tests to set up temp data/config dirs with valid groups.json
+
+import { execSync } from 'node:child_process';
+import { existsSync, mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+
+export interface TestFixture {
+  /** Root temp directory for this test run; parent of dataDir, configDir and projectDir */
+  rootDir: string;
+  /** Isolated data dir, passed to the app as AGOR_TEST_DATA_DIR */
+  dataDir: string;
+  /** Isolated config dir (holds groups.json), passed to the app as AGOR_TEST_CONFIG_DIR */
+  configDir: string;
+  /** Path to a minimal git repo for agent testing */
+  projectDir: string;
+  /** Environment variables to pass to the app (AGOR_TEST plus the two directory overrides) */
+  env: Record<string, string>;
+}
+
+/**
+ * Create an isolated test fixture with:
+ * - Temp data dir (sessions.db, btmsg.db created at runtime)
+ * - Temp config dir with a minimal groups.json
+ * - A simple git repo with two files (README.md, hello.py) for agent testing
+ *
+ * The root directory is created with mkdtempSync, so every call gets its own
+ * directory even when several fixtures are created within the same millisecond
+ * (e.g. by parallel workers).
+ *
+ * @param name Prefix of the temp directory name.
+ * @returns Paths of the created directories plus the env vars to pass to the app.
+ * @throws If a directory cannot be created or one of the git commands fails.
+ */
+export function createTestFixture(name = 'agor-e2e'): TestFixture {
+  // The timestamp keeps directories sortable by creation time; the random suffix
+  // appended by mkdtempSync guarantees uniqueness.
+  const rootDir = mkdtempSync(join(tmpdir(), `${name}-${Date.now()}-`));
+  const dataDir = join(rootDir, 'data');
+  const configDir = join(rootDir, 'config');
+  const projectDir = join(rootDir, 'test-project');
+
+  // Create directory structure
+  mkdirSync(dataDir, { recursive: true });
+  mkdirSync(configDir, { recursive: true });
+  mkdirSync(projectDir, { recursive: true });
+
+  // Create a minimal git repo for agent testing.
+  // A repo-local identity is configured so the commit works without global git config.
+  execSync('git init', { cwd: projectDir, stdio: 'ignore' });
+  execSync('git config user.email "test@agor.dev"', { cwd: projectDir, stdio: 'ignore' });
+  execSync('git config user.name "Agor Test"', { cwd: projectDir, stdio: 'ignore' });
+  writeFileSync(join(projectDir, 'README.md'), '# Test Project\n\nA simple test project for Agor E2E tests.\n');
+  writeFileSync(join(projectDir, 'hello.py'), 'def greet(name: str) -> str:\n    return f"Hello, {name}!"\n');
+  execSync('git add -A && git commit -m "initial commit"', { cwd: projectDir, stdio: 'ignore' });
+
+  // Write groups.json with one group containing the test project
+  const groupsJson = {
+    version: 1,
+    groups: [
+      {
+        id: 'test-group',
+        name: 'Test Group',
+        projects: [
+          {
+            id: 'test-project',
+            name: 'Test Project',
+            identifier: 'test-project',
+            description: 'E2E test project',
+            icon: '\uf120',
+            cwd: projectDir,
+            profile: 'default',
+            enabled: true,
+          },
+        ],
+        agents: [],
+      },
+    ],
+    activeGroupId: 'test-group',
+  };
+
+  writeFileSync(
+    join(configDir, 'groups.json'),
+    JSON.stringify(groupsJson, null, 2),
+  );
+
+  // NOTE(review): these names duplicate the constants exported by
+  // test-mode-constants.ts — consider importing them instead.
+  const env: Record<string, string> = {
+    AGOR_TEST: '1',
+    AGOR_TEST_DATA_DIR: dataDir,
+    AGOR_TEST_CONFIG_DIR: configDir,
+  };
+
+  return { rootDir, dataDir, configDir, projectDir, env };
+}
+
+/**
+ * Remove a fixture's root temp directory and everything beneath it.
+ * Does nothing when the directory no longer exists.
+ */
+export function destroyTestFixture(fixture: TestFixture): void {
+  const { rootDir } = fixture;
+  if (!existsSync(rootDir)) {
+    return;
+  }
+  rmSync(rootDir, { force: true, recursive: true });
+}
+
+/**
+ * Build a fixture whose groups.json lists several projects, for multi-project testing.
+ *
+ * Starts from createTestFixture('agor-multi'), creates `projectCount` small git
+ * repos (project-0, project-1, ...) under the fixture root, then overwrites
+ * groups.json with a single 'multi-group' that contains them.
+ */
+export function createMultiProjectFixture(projectCount = 3): TestFixture {
+  const fixture = createTestFixture('agor-multi');
+  const projects = [];
+
+  for (let index = 0; index < projectCount; index += 1) {
+    const slug = `project-${index}`;
+    const repoDir = join(fixture.rootDir, slug);
+    const gitOpts = { cwd: repoDir, stdio: 'ignore' } as const;
+
+    // Initialise the repo with a repo-local identity, then commit a README.
+    mkdirSync(repoDir, { recursive: true });
+    const setupCommands = [
+      'git init',
+      'git config user.email "test@agor.dev"',
+      'git config user.name "Agor Test"',
+    ];
+    for (const command of setupCommands) {
+      execSync(command, gitOpts);
+    }
+    writeFileSync(join(repoDir, 'README.md'), `# Project ${index}\n`);
+    execSync('git add -A && git commit -m "init"', gitOpts);
+
+    projects.push({
+      id: slug,
+      name: `Project ${index}`,
+      identifier: slug,
+      description: `Test project ${index}`,
+      icon: '\uf120',
+      cwd: repoDir,
+      profile: 'default',
+      enabled: true,
+    });
+  }
+
+  // Replace the single-project groups.json written by createTestFixture.
+  const groupsJson = {
+    version: 1,
+    groups: [
+      {
+        id: 'multi-group',
+        name: 'Multi Project Group',
+        projects,
+        agents: [],
+      },
+    ],
+    activeGroupId: 'multi-group',
+  };
+  const groupsPath = join(fixture.configDir, 'groups.json');
+  writeFileSync(groupsPath, JSON.stringify(groupsJson, null, 2));
+
+  return fixture;
+}
--- a/tests/e2e/infra/llm-judge.ts
+++ b/tests/e2e/infra/llm-judge.ts
@ -0,0 +1,231 @@
+// LLM Judge — evaluates test outcomes via Claude.
+//
+// Two backends, configurable via LLM_JUDGE_BACKEND env var:
+//   "cli"  — Claude CLI (default, no API key needed)
+//   "api"  — Anthropic REST API (requires ANTHROPIC_API_KEY)
+//
+// CLI backend: spawns `claude` with --output-format text, parses JSON verdict.
+// API backend: raw fetch to messages API, same JSON verdict parsing.
+//
+// judge() throws when neither backend is available; call isJudgeAvailable() first to skip gracefully.
+
+import { execFileSync, execSync } from 'node:child_process';
+import { existsSync } from 'node:fs';
+
+// Model used by both backends.
+const MODEL = 'claude-haiku-4-5-20251001';
+// Endpoint and response budget used by the API backend.
+const API_URL = 'https://api.anthropic.com/v1/messages';
+const MAX_TOKENS = 512;
+
+// CLI search paths (in order).
+// The per-user locations are only listed when HOME is set; otherwise the template
+// would produce the bogus relative path "undefined/.local/bin/claude".
+const HOME_DIR = process.env.HOME;
+const CLI_PATHS = [
+  ...(HOME_DIR ? [`${HOME_DIR}/.local/bin/claude`, `${HOME_DIR}/.claude/local/claude`] : []),
+  '/usr/local/bin/claude',
+  '/usr/bin/claude',
+];
+
+export type JudgeBackend = 'cli' | 'api'; // 'cli' = Claude CLI subprocess, 'api' = Anthropic REST API
+
+export interface JudgeVerdict {
+  pass: boolean; // whether the judge considers the criteria satisfied
+  reasoning: string; // the judge's brief explanation; '' when the model omitted it
+  confidence: number; // 0-1 as requested in the prompt; 0 when missing or not numeric
+}
+
+/**
+ * Locate the Claude CLI binary.
+ * Tries the well-known install locations in order, then falls back to a PATH
+ * lookup via `which`. Returns the path, or null if not installed.
+ */
+function findClaudeCli(): string | null {
+  const known = CLI_PATHS.find((candidate) => existsSync(candidate));
+  if (known !== undefined) return known;
+
+  // Fallback: check PATH
+  let resolved = '';
+  try {
+    resolved = execSync('which claude 2>/dev/null', { encoding: 'utf-8' }).trim();
+  } catch {
+    // not found (execSync throws when the command exits non-zero)
+  }
+  return resolved ? resolved : null;
+}
+
+/**
+ * Pick the backend to use, or null when none is usable.
+ *
+ * LLM_JUDGE_BACKEND ("cli" or "api", case-insensitive) forces a backend and
+ * yields null if that backend is unavailable. Any other value (or none) means
+ * auto-detect: CLI if installed, then API if ANTHROPIC_API_KEY is set.
+ */
+function resolveBackend(): JudgeBackend | null {
+  const cliInstalled = (): boolean => Boolean(findClaudeCli());
+  const apiKeySet = (): boolean => Boolean(process.env.ANTHROPIC_API_KEY);
+
+  switch (process.env.LLM_JUDGE_BACKEND?.toLowerCase()) {
+    case 'cli':
+      return cliInstalled() ? 'cli' : null;
+    case 'api':
+      return apiKeySet() ? 'api' : null;
+    default:
+      break;
+  }
+
+  // Auto-detect: CLI first, API fallback
+  if (cliInstalled()) return 'cli';
+  return apiKeySet() ? 'api' : null;
+}
+
+/**
+ * Report whether a judge backend can be used (CLI installed or API key set),
+ * honouring the LLM_JUDGE_BACKEND override.
+ */
+export function isJudgeAvailable(): boolean {
+  const backend = resolveBackend();
+  return backend !== null;
+}
+
+/**
+ * Build the system and user prompts for the judge.
+ * The "Additional Context" section is only emitted when `context` is non-empty.
+ */
+function buildPrompt(criteria: string, actual: string, context?: string): { system: string; user: string } {
+  const system = [
+    'You are a test assertion judge for a terminal emulator application called BTerminal.',
+    'Your job is to evaluate whether actual output from the application meets the given criteria.',
+    'Respond with EXACTLY this JSON format, nothing else:',
+    '{"pass": true/false, "reasoning": "brief explanation", "confidence": 0.0-1.0}',
+  ].join('\n');
+
+  const sections = ['## Criteria', criteria, '', '## Actual Output', actual];
+  if (context) {
+    sections.push('', '## Additional Context', context);
+  }
+  sections.push('', 'Does the actual output satisfy the criteria? Respond with JSON only.');
+
+  return { system, user: sections.join('\n') };
+}
+
+/**
+ * Extract and validate a JudgeVerdict from raw text output.
+ *
+ * Takes the span from the first "{" to the last "}" in `text`, parses it as
+ * JSON and normalises the fields:
+ * - `pass` must be a boolean, otherwise the verdict is rejected
+ * - `confidence` is coerced to a number and clamped to 0-1 (0 when not finite)
+ * - `reasoning` is coerced to a string ('' when missing)
+ *
+ * @throws Error when no JSON object is found, the JSON is malformed, or `pass`
+ *   is not a boolean. The message always includes the raw text.
+ */
+function parseVerdict(text: string): JudgeVerdict {
+  const jsonMatch = text.match(/\{[\s\S]*\}/);
+  if (!jsonMatch) {
+    throw new Error(`LLM judge returned non-JSON: ${text}`);
+  }
+
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(jsonMatch[0]);
+  } catch {
+    // Surface malformed JSON with the same context as the no-match case
+    // instead of a bare SyntaxError.
+    throw new Error(`LLM judge returned non-JSON: ${text}`);
+  }
+
+  const fields: Record<string, unknown> =
+    typeof parsed === 'object' && parsed !== null ? (parsed as Record<string, unknown>) : {};
+
+  const pass = fields.pass;
+  if (typeof pass !== 'boolean') {
+    throw new Error(`LLM judge returned invalid verdict: ${text}`);
+  }
+
+  // The prompt asks for 0.0-1.0; clamp so an out-of-range answer cannot
+  // trivially satisfy (or fall below) a confidence threshold.
+  const rawConfidence = Number(fields.confidence);
+  const confidence = Number.isFinite(rawConfidence) ? Math.min(1, Math.max(0, rawConfidence)) : 0;
+  const reasoning = String(fields.reasoning || '');
+
+  return { pass, reasoning, confidence };
+}
+
+/**
+ * Judge via Claude CLI (spawns a subprocess and blocks until it exits).
+ * Sets CLAUDECODE to an empty string in the child's environment to avoid
+ * nested session errors.
+ *
+ * @throws Error when the CLI is not found, the subprocess fails or exceeds
+ *   the 60 s timeout, or the output cannot be parsed into a verdict.
+ */
+async function judgeCli(
+  criteria: string,
+  actual: string,
+  context?: string,
+): Promise<JudgeVerdict> {
+  const binary = findClaudeCli();
+  if (!binary) throw new Error('Claude CLI not found');
+
+  const prompt = buildPrompt(criteria, actual, context);
+  const cliArgs = [
+    '-p', prompt.user,
+    '--model', MODEL,
+    '--output-format', 'text',
+    '--system-prompt', prompt.system,
+    '--setting-sources', 'user',   // skip project CLAUDE.md
+  ];
+
+  const stdout = execFileSync(binary, cliArgs, {
+    cwd: '/tmp',                   // avoid loading project CLAUDE.md
+    encoding: 'utf-8',
+    env: { ...process.env, CLAUDECODE: '' },
+    maxBuffer: 1024 * 1024,
+    timeout: 60_000,
+  });
+
+  return parseVerdict(stdout);
+}
+
+/**
+ * Judge via Anthropic REST API (raw fetch).
+ *
+ * The request is aborted after 60 s, matching the CLI backend's timeout, so a
+ * stalled connection cannot hang the test run. The verdict is parsed from the
+ * first content block that carries a string `text`.
+ *
+ * @throws Error when ANTHROPIC_API_KEY is not set, the API responds with a
+ *   non-2xx status, or the reply cannot be parsed into a verdict. A timeout
+ *   rejects with the abort error raised by fetch.
+ */
+async function judgeApi(
+  criteria: string,
+  actual: string,
+  context?: string,
+): Promise<JudgeVerdict> {
+  const apiKey = process.env.ANTHROPIC_API_KEY;
+  if (!apiKey) throw new Error('ANTHROPIC_API_KEY not set');
+
+  const { system, user } = buildPrompt(criteria, actual, context);
+
+  const response = await fetch(API_URL, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+      'x-api-key': apiKey,
+      'anthropic-version': '2023-06-01',
+    },
+    body: JSON.stringify({
+      model: MODEL,
+      max_tokens: MAX_TOKENS,
+      system,
+      messages: [{ role: 'user', content: user }],
+    }),
+    signal: AbortSignal.timeout(60_000),
+  });
+
+  if (!response.ok) {
+    const body = await response.text();
+    throw new Error(`Anthropic API error ${response.status}: ${body}`);
+  }
+
+  // Treat the payload as untrusted: narrow step by step instead of relying on `any`.
+  const data: unknown = await response.json();
+  const content: unknown =
+    typeof data === 'object' && data !== null ? (data as { content?: unknown }).content : undefined;
+  const blocks: unknown[] = Array.isArray(content) ? content : [];
+  const text =
+    blocks
+      .map((block) =>
+        typeof block === 'object' && block !== null ? (block as { text?: unknown }).text : undefined,
+      )
+      .find((value): value is string => typeof value === 'string') ?? '';
+
+  return parseVerdict(text);
+}
+
+/**
+ * Ask Claude to evaluate whether `actual` output satisfies `criteria`.
+ *
+ * The backend is chosen per call: CLI by default, API as fallback, or whatever
+ * LLM_JUDGE_BACKEND ("cli" or "api") forces.
+ *
+ * Resolves to a structured verdict with pass/fail, reasoning, and confidence.
+ * Rejects if no backend is available or the backend call fails.
+ */
+export async function judge(
+  criteria: string,
+  actual: string,
+  context?: string,
+): Promise<JudgeVerdict> {
+  switch (resolveBackend()) {
+    case 'cli':
+      return judgeCli(criteria, actual, context);
+    case 'api':
+      return judgeApi(criteria, actual, context);
+    default:
+      throw new Error('LLM judge unavailable — no Claude CLI found and ANTHROPIC_API_KEY not set');
+  }
+}
+
+/**
+ * Convenience wrapper around judge() that enforces a minimum confidence.
+ * A passing verdict whose confidence is below `minConfidence` (default 0.7) is
+ * downgraded to pass=false, and the reason is appended to `reasoning`.
+ * Failing verdicts are returned untouched.
+ */
+export async function assertWithJudge(
+  criteria: string,
+  actual: string,
+  options: { context?: string; minConfidence?: number } = {},
+): Promise<JudgeVerdict> {
+  const threshold = options.minConfidence === undefined ? 0.7 : options.minConfidence;
+  const verdict = await judge(criteria, actual, options.context);
+
+  // Nothing to downgrade: already failing, or confident enough.
+  if (!verdict.pass || !(verdict.confidence < threshold)) {
+    return verdict;
+  }
+
+  verdict.pass = false;
+  verdict.reasoning += ` (confidence ${verdict.confidence} below threshold ${threshold})`;
+  return verdict;
+}
--- a/tests/e2e/infra/results-db.ts
+++ b/tests/e2e/infra/results-db.ts
@ -0,0 +1,113 @@
+// Test results store — persists test run outcomes as JSON for analysis
+// No native deps needed — reads/writes a JSON file
+
+import { resolve, dirname } from 'node:path';
+import { mkdirSync, readFileSync, writeFileSync, existsSync } from 'node:fs';
+import { fileURLToPath } from 'node:url';
+
+const __dirname = dirname(fileURLToPath(import.meta.url)); // directory of this module, derived from its file URL
+const DEFAULT_PATH = resolve(__dirname, '../../test-results/results.json'); // NOTE(review): from tests/e2e/infra/ this resolves to tests/test-results/results.json — confirm that is the intended location
+
+export interface TestRunRow {
+  run_id: string;
+  started_at: string; // ISO timestamp set by startRun()
+  finished_at: string | null; // null until finishRun() is called
+  status: 'running' | 'passed' | 'failed' | 'error';
+  total_tests: number; // number of steps recorded for the run; filled in by finishRun()
+  passed_tests: number; // steps with status 'passed'
+  failed_tests: number; // steps with status 'failed' or 'error'
+  duration_ms: number | null; // null until finishRun() is called
+  git_branch: string | null;
+  git_sha: string | null;
+}
+
+export interface TestStepRow {
+  run_id: string; // matched against TestRunRow.run_id when steps are looked up
+  scenario_name: string;
+  step_name: string;
+  status: 'passed' | 'failed' | 'skipped' | 'error';
+  duration_ms: number | null;
+  error_message: string | null;
+  screenshot_path: string | null;
+  agent_cost_usd: number | null;
+  created_at: string; // ISO timestamp added by recordStep()
+}
+
+interface ResultsStore { // shape of the JSON document read and written by ResultsDb
+  runs: TestRunRow[];
+  steps: TestStepRow[];
+}
+
+/**
+ * JSON-file-backed store for test runs and their steps.
+ *
+ * The whole store is loaded into memory in the constructor and rewritten to
+ * disk after every mutation (startRun, finishRun, recordStep).
+ */
+export class ResultsDb {
+  private readonly filePath: string;
+  private readonly store: ResultsStore;
+
+  /**
+   * @param filePath Location of the results JSON file; its parent directory is
+   *   created if missing.
+   */
+  constructor(filePath = DEFAULT_PATH) {
+    this.filePath = filePath;
+    mkdirSync(dirname(filePath), { recursive: true });
+    this.store = this.load();
+  }
+
+  /**
+   * Read the store from disk. Falls back to an empty store when the file is
+   * missing or unreadable, and to an empty list for any of `runs` / `steps`
+   * that is not an array, so later pushes cannot crash on a malformed file.
+   */
+  private load(): ResultsStore {
+    if (!existsSync(this.filePath)) {
+      return { runs: [], steps: [] };
+    }
+    try {
+      const parsed: unknown = JSON.parse(readFileSync(this.filePath, 'utf-8'));
+      const raw: Partial<ResultsStore> =
+        typeof parsed === 'object' && parsed !== null ? (parsed as Partial<ResultsStore>) : {};
+      return {
+        runs: Array.isArray(raw.runs) ? raw.runs : [],
+        steps: Array.isArray(raw.steps) ? raw.steps : [],
+      };
+    } catch {
+      // Corrupt or unreadable file: start over with an empty store.
+      return { runs: [], steps: [] };
+    }
+  }
+
+  /** Write the in-memory store to disk as pretty-printed JSON. */
+  private save(): void {
+    writeFileSync(this.filePath, JSON.stringify(this.store, null, 2));
+  }
+
+  /** Register a new run in status 'running' with zeroed counters and persist it. */
+  startRun(runId: string, gitBranch?: string, gitSha?: string): void {
+    this.store.runs.push({
+      run_id: runId,
+      started_at: new Date().toISOString(),
+      finished_at: null,
+      status: 'running',
+      total_tests: 0,
+      passed_tests: 0,
+      failed_tests: 0,
+      duration_ms: null,
+      git_branch: gitBranch ?? null,
+      git_sha: gitSha ?? null,
+    });
+    this.save();
+  }
+
+  /**
+   * Mark a run as finished and derive its counters from the recorded steps.
+   * Steps with status 'error' count as failed; 'skipped' steps only count
+   * towards the total. Unknown run ids are ignored.
+   */
+  finishRun(runId: string, status: 'passed' | 'failed' | 'error', durationMs: number): void {
+    const run = this.store.runs.find(r => r.run_id === runId);
+    if (!run) return;
+
+    const steps = this.store.steps.filter(s => s.run_id === runId);
+    run.finished_at = new Date().toISOString();
+    run.status = status;
+    run.duration_ms = durationMs;
+    run.total_tests = steps.length;
+    run.passed_tests = steps.filter(s => s.status === 'passed').length;
+    run.failed_tests = steps.filter(s => s.status === 'failed' || s.status === 'error').length;
+    this.save();
+  }
+
+  /** Append a step, stamping it with the current time, and persist it. */
+  recordStep(step: Omit<TestStepRow, 'created_at'>): void {
+    this.store.steps.push({
+      ...step,
+      created_at: new Date().toISOString(),
+    });
+    this.save();
+  }
+
+  /**
+   * Return up to `limit` runs, newest first (by started_at).
+   * Sorts a copy, so reading does not reorder the stored runs.
+   */
+  getRecentRuns(limit = 20): TestRunRow[] {
+    return [...this.store.runs]
+      .sort((a, b) => b.started_at.localeCompare(a.started_at))
+      .slice(0, limit);
+  }
+
+  /** Return all steps recorded for the given run, in insertion order. */
+  getStepsForRun(runId: string): TestStepRow[] {
+    return this.store.steps.filter(s => s.run_id === runId);
+  }
+}
--- a/tests/e2e/infra/test-mode-constants.ts
+++ b/tests/e2e/infra/test-mode-constants.ts
@ -0,0 +1,23 @@
+// Typed constants for test-mode environment variables.
+// Single source of truth for env var names — prevents string literal duplication.
+//
+// These env vars are read by:
+//   Rust:  agor-core/src/config.rs (AppConfig::from_env)
+//          src-tauri/src/commands/misc.rs (is_test_mode)
+//          src-tauri/src/lib.rs (setup: skip CLI install, forward to sidecar)
+//          src-tauri/src/watcher.rs (disable file watcher)
+//          src-tauri/src/fs_watcher.rs (disable fs watcher)
+//          src-tauri/src/telemetry.rs (disable OTLP)
+//   Svelte: src/App.svelte (disable wake scheduler)
+
+/** Name of the main test mode flag — set the env var to '1' to enable test isolation */
+export const AGOR_TEST = 'AGOR_TEST';
+
+/** Name of the env var that overrides the data directory (sessions.db, btmsg.db, search.db) */
+export const AGOR_TEST_DATA_DIR = 'AGOR_TEST_DATA_DIR';
+
+/** Name of the env var that overrides the config directory (groups.json, plugins/) */
+export const AGOR_TEST_CONFIG_DIR = 'AGOR_TEST_CONFIG_DIR';
+
+/** All test-mode env var names, as a readonly tuple, for iteration */
+export const TEST_ENV_VARS = [AGOR_TEST, AGOR_TEST_DATA_DIR, AGOR_TEST_CONFIG_DIR] as const;