refactor(e2e): extract infrastructure into tests/e2e/infra/ module

- Move fixtures.ts, llm-judge.ts, results-db.ts to tests/e2e/infra/
- Deduplicate wdio.conf.js: use createTestFixture() instead of inline copy
- Replace __dirname paths with projectRoot-anchored paths
- Create test-mode-constants.ts (typed env var names, flag registry)
- Create scripts/preflight-check.sh (validates tauri-driver, display, Claude CLI)
- Create scripts/check-test-flags.sh (CI lint for AGOR_TEST flag drift)
- Rewrite tests/e2e/README.md with full documentation
- Update spec imports for moved infra files
2026-03-18 03:06:57 +01:00 · 2026-03-18 03:06:57 +01:00 · e76bc341f2
commit e76bc341f2
parent 538a31f85c
10 changed files with 235 additions and 191 deletions
--- a/scripts/check-test-flags.sh
+++ b/scripts/check-test-flags.sh
@ -0,0 +1,50 @@
 #!/usr/bin/env bash
 # Lint check: verify all AGOR_TEST references are documented.
 # Run in CI to catch flag drift between code and documentation.
 #
 # Run from the repository root: the KNOWN_FILES entries are root-relative and
 # the search below starts at the current directory.
 # Exit status: 0 when every referencing file is registered, 1 otherwise.
 set -euo pipefail
 echo "Checking AGOR_TEST flag references..."
 # Known files that should reference AGOR_TEST (from test-mode-constants.ts)
 KNOWN_FILES=(
   "agor-core/src/config.rs"
   "src-tauri/src/commands/misc.rs"
   "src-tauri/src/lib.rs"
   "src-tauri/src/watcher.rs"
   "src-tauri/src/fs_watcher.rs"
   "src-tauri/src/telemetry.rs"
   "src/App.svelte"
   "tests/e2e/wdio.conf.js"
   "tests/e2e/infra/fixtures.ts"
   "tests/e2e/infra/test-mode-constants.ts"
 )
 # Find all files referencing AGOR_TEST (excluding node_modules, target, .git).
 # grep exits 1 when nothing matches; under `set -e -o pipefail` that would abort
 # the script silently, so exit status 1 (and only 1) is treated as "no matches".
 FOUND=$(
   {
     grep -rl 'AGOR_TEST' \
       --include='*.rs' --include='*.ts' --include='*.js' --include='*.svelte' \
       --exclude-dir=node_modules --exclude-dir=target --exclude-dir=.git \
       . 2>/dev/null || [[ $? -eq 1 ]]
   } | sed 's|^\./||' | sort
 )
 # Collect unregistered files in an array and read the list line by line, so
 # paths containing spaces or glob characters are kept intact.
 UNKNOWN=()
 while IFS= read -r f; do
   # An empty FOUND still yields one empty line from the here-string.
   [[ -n "$f" ]] || continue
   MATCH=0
   for k in "${KNOWN_FILES[@]}"; do
     if [[ "$f" == "$k" ]]; then
       MATCH=1
       break
     fi
   done
   if [[ $MATCH -eq 0 ]]; then
     UNKNOWN+=("$f")
   fi
 done <<< "$FOUND"
 if [[ ${#UNKNOWN[@]} -gt 0 ]]; then
   echo ""
   echo "WARNING: AGOR_TEST referenced in files not in the known registry:"
   # printf prints each path verbatim (no backslash-escape interpretation).
   printf '  %s\n' "${UNKNOWN[@]}"
   echo ""
   echo "Update tests/e2e/infra/test-mode-constants.ts and this script."
   exit 1
 else
   echo "All AGOR_TEST references are documented. ✓"
 fi
--- a/scripts/preflight-check.sh
+++ b/scripts/preflight-check.sh
@ -0,0 +1,68 @@
 #!/usr/bin/env bash
 # Preflight check for E2E tests — validates system dependencies.
 # Run before E2E suite to catch missing tools early.
 #
 # The debug-binary check uses a relative path (target/debug/...), so run this
 # from the directory that contains target/.
 # Exit status: 1 when a required dependency is missing, 0 otherwise
 # (missing optional tools only produce a warning).
 set -euo pipefail
 MISSING=0  # becomes 1 as soon as any required dependency is absent
 WARN=0     # becomes 1 when an optional dependency is absent

 # check NAME COMMAND [ARGS...]
 # Runs COMMAND with its output discarded, prints a ✓ or ✗ line for NAME,
 # and returns 1 when COMMAND fails. The command is passed as an argument
 # vector rather than a string, so no eval is needed.
 check() {
   local name="$1"
   shift
   if "$@" >/dev/null 2>&1; then
     echo "  ✓ $name"
   else
     echo "  ✗ $name"
     return 1
   fi
 }

 echo "E2E Preflight Check"
 echo "==================="
 echo ""
 echo "Required:"
 if ! check "tauri-driver" command -v tauri-driver; then
   echo "    Install: cargo install tauri-driver"
   MISSING=1
 fi
 if ! check "debug binary" test -f target/debug/agent-orchestrator; then
   echo "    Build: cargo tauri build --debug --no-bundle"
   MISSING=1
 fi
 # Display server (Linux only)
 if [[ "$(uname)" == "Linux" ]]; then
   # Each variable is only expanded inside the branch that proved it is set;
   # under `set -u` a bare $DISPLAY would abort the script on a Wayland-only
   # session where just WAYLAND_DISPLAY is defined.
   if [[ -n "${DISPLAY:-}" ]]; then
     echo "  ✓ display server (DISPLAY=$DISPLAY)"
   elif [[ -n "${WAYLAND_DISPLAY:-}" ]]; then
     echo "  ✓ display server (WAYLAND_DISPLAY=$WAYLAND_DISPLAY)"
   else
     echo "  ✗ display server — no DISPLAY or WAYLAND_DISPLAY set"
     echo "    Use: xvfb-run --auto-servernum npm run test:e2e"
     MISSING=1
   fi
 fi
 echo ""
 echo "Optional (LLM judge):"
 if command -v claude >/dev/null 2>&1; then
   echo "  ✓ Claude CLI ($(command -v claude))"
 elif [[ -n "${ANTHROPIC_API_KEY:-}" ]]; then
   echo "  ✓ ANTHROPIC_API_KEY set"
 else
   echo "  ~ Claude CLI not found, ANTHROPIC_API_KEY not set"
   echo "    LLM-judged tests will be skipped"
   WARN=1
 fi
 echo ""
 if [[ $MISSING -ne 0 ]]; then
   echo "FAILED — missing required dependencies"
   exit 1
 elif [[ $WARN -ne 0 ]]; then
   echo "PASSED with warnings"
 else
   echo "ALL CHECKS PASSED"
 fi
--- a/tests/e2e/README.md
+++ b/tests/e2e/README.md
@ -1,143 +1,88 @@
-# E2E Tests (WebDriver)
+# E2E Testing Module
-Tauri apps use the WebDriver protocol for E2E testing (not Playwright directly).
+Browser automation tests for Agent Orchestrator using WebDriverIO + tauri-driver.
 The app runs inside WebKit2GTK on Linux, so tests interact with the real WebView.
-## Prerequisites
+## Quick Start
 - Rust toolchain (for building the Tauri app)
 - Display server (X11 or Wayland) — headless Xvfb works for CI
 - `tauri-driver` installed: `cargo install tauri-driver`
 - `webkit2gtk-driver` system package: `sudo apt install webkit2gtk-driver`
 - npm devDeps already in package.json (`@wdio/cli`, `@wdio/local-runner`, `@wdio/mocha-framework`, `@wdio/spec-reporter`)
 ## Running
 ```bash
-# From v2/ directory — builds debug binary automatically, spawns tauri-driver
+# Preflight check (validates dependencies)
-npm run test:e2e
+./scripts/preflight-check.sh
-# Skip rebuild (use existing binary)
+# Build debug binary + run E2E
 npm run test:all:e2e
 # Run E2E only (skip build)
 SKIP_BUILD=1 npm run test:e2e
-# With test isolation (custom data/config dirs)
+# Headless (CI)
-AGOR_TEST_DATA_DIR=/tmp/bt-test/data AGOR_TEST_CONFIG_DIR=/tmp/bt-test/config npm run test:e2e
+xvfb-run --auto-servernum npm run test:e2e
 ```
-The `wdio.conf.js` handles:
+## System Dependencies
 1. Building the debug binary (`cargo tauri build --debug --no-bundle`) in `onPrepare`
 2. Spawning `tauri-driver` before each session (TCP readiness probe, 10s deadline)
 3. Killing `tauri-driver` after each session
 4. Passing `AGOR_TEST=1` env var to the app for test mode isolation
-## Test Mode (`AGOR_TEST=1`)
+| Tool | Required | Install |
 |------|----------|---------|
 | tauri-driver | Yes | `cargo install tauri-driver` |
 | Debug binary | Yes | `cargo tauri build --debug --no-bundle` |
 | X11/Wayland | Yes (Linux) | Use `xvfb-run` in CI |
 | Claude CLI | Optional | LLM-judged tests skip if absent |
 | ANTHROPIC_API_KEY | Optional | Alternative to Claude CLI for LLM judge |
-When `AGOR_TEST=1` is set:
+## Directory Structure
 - File watchers (watcher.rs, fs_watcher.rs) are disabled to avoid inotify noise
 - Wake scheduler is disabled (no auto-wake timers)
 - Data/config directories can be overridden via `AGOR_TEST_DATA_DIR` / `AGOR_TEST_CONFIG_DIR`
 ## CI setup (headless)
 ```bash
 # Install virtual framebuffer + WebKit driver
 sudo apt install xvfb webkit2gtk-driver
 # Run with Xvfb wrapper
 xvfb-run npm run test:e2e
 ```
 ## Writing tests
 Tests use WebdriverIO with Mocha. Specs go in `specs/`:
 ```typescript
 import { browser, expect } from '@wdio/globals';
 describe('BTerminal', () => {
  it('should show the status bar', async () => {
    const statusBar = await browser.$('[data-testid="status-bar"]');
    await expect(statusBar).toBeDisplayed();
  });
 });
 ```
 ### Stable selectors
 Prefer `data-testid` attributes over CSS class selectors:
 | Element | Selector |
 |---------|----------|
 | Status bar | `[data-testid="status-bar"]` |
 | Sidebar rail | `[data-testid="sidebar-rail"]` |
 | Settings button | `[data-testid="settings-btn"]` |
 | Project box | `[data-testid="project-box"]` |
 | Project ID | `[data-project-id="..."]` |
 | Project tabs | `[data-testid="project-tabs"]` |
 | Agent session | `[data-testid="agent-session"]` |
 | Agent pane | `[data-testid="agent-pane"]` |
 | Agent status | `[data-agent-status="idle\|running\|..."]` |
 | Agent messages | `[data-testid="agent-messages"]` |
 | Agent prompt | `[data-testid="agent-prompt"]` |
 | Agent submit | `[data-testid="agent-submit"]` |
 | Agent stop | `[data-testid="agent-stop"]` |
 | Terminal tabs | `[data-testid="terminal-tabs"]` |
 | Add tab button | `[data-testid="tab-add"]` |
 | Terminal toggle | `[data-testid="terminal-toggle"]` |
 | Command palette | `[data-testid="command-palette"]` |
 | Palette input | `[data-testid="palette-input"]` |
 ### Key constraints
 - `maxInstances: 1` — Tauri doesn't support parallel WebDriver sessions
 - Mocha timeout is 60s — the app needs time to initialize
 - Tests interact with the real WebKit2GTK WebView, not a browser
 - Use `browser.execute()` for JS clicks when WebDriver clicks don't trigger Svelte handlers
 - Agent tests (Scenario 7) require a real Claude CLI install + API key — they skip gracefully if unavailable
 ## Test infrastructure
 ### Fixtures (`fixtures.ts`)
 Creates isolated test environments with temp data/config dirs and git repos:
 ```typescript
 import { createTestFixture, destroyTestFixture } from '../fixtures';
 const fixture = createTestFixture('my-test');
 // fixture.dataDir, fixture.configDir, fixture.projectDir, fixture.env
 destroyTestFixture(fixture);
 ```
 ### Results DB (`results-db.ts`)
 JSON-based test results store for tracking runs and steps:
 ```typescript
 import { ResultsDb } from '../results-db';
 const db = new ResultsDb();
 db.startRun('run-001', 'v2-mission-control', 'abc123');
 db.recordStep({ run_id: 'run-001', scenario_name: 'Smoke', step_name: 'renders', status: 'passed', ... });
 db.finishRun('run-001', 'passed', 5000);
 ```
 ## File structure
 ```
 tests/e2e/
-├── README.md                         # This file
+├── wdio.conf.js          # WebDriverIO config + tauri-driver lifecycle
-├── wdio.conf.js                      # WebdriverIO config with tauri-driver lifecycle
+├── tsconfig.json          # TypeScript config for specs
-├── tsconfig.json                     # TypeScript config for test specs
+├── README.md              # This file
-├── fixtures.ts                       # Test fixture generator (isolated environments)
+├── infra/                 # Test infrastructure (not specs)
-├── results-db.ts                     # JSON test results store
+│   ├── fixtures.ts        # Test fixture generator (isolated temp dirs)
-└── specs/
+│   ├── llm-judge.ts       # LLM-based assertion engine (Claude CLI / API)
-    ├── agor.test.ts             # Smoke tests (CSS class selectors, 50+ tests)
+│   ├── results-db.ts      # JSON test results store
-    └── agent-scenarios.test.ts       # Phase A scenarios (data-testid selectors, 22 tests)
+│   └── test-mode-constants.ts  # Typed env var names for test mode
 └── specs/                 # Test specifications
    ├── agor.test.ts       # Smoke + UI tests (50+ tests)
    ├── agent-scenarios.test.ts  # Phase A: agent interaction (22 tests)
    ├── phase-b.test.ts    # Phase B: multi-project + LLM judge
    └── phase-c.test.ts    # Phase C: hardening features (11 scenarios)
 ```
-## References
+## Test Mode Environment Variables
- Tauri WebDriver docs: https://v2.tauri.app/develop/tests/webdriver/
+| Variable | Purpose | Read By |
- WebdriverIO docs: https://webdriver.io/
+|----------|---------|---------|
- tauri-driver: https://crates.io/crates/tauri-driver
+| `AGOR_TEST=1` | Enable test isolation | config.rs, misc.rs, lib.rs, watcher.rs, fs_watcher.rs, telemetry.rs, App.svelte |
 | `AGOR_TEST_DATA_DIR` | Override data dir | config.rs |
 | `AGOR_TEST_CONFIG_DIR` | Override config dir | config.rs |
 **Effects when AGOR_TEST=1:**
 - File watchers disabled (watcher.rs, fs_watcher.rs)
 - OTLP telemetry export disabled (telemetry.rs)
 - CLI tool installation skipped (lib.rs)
 - Wake scheduler disabled (App.svelte)
 - Test env vars forwarded to sidecar processes (lib.rs)
 ## Test Phases
 | Phase | File | Tests | Type |
 |-------|------|-------|------|
 | Smoke | agor.test.ts | 50+ | Deterministic (CSS/DOM assertions) |
 | A | agent-scenarios.test.ts | 22 | Deterministic (data-testid selectors) |
 | B | phase-b.test.ts | 6+ | LLM-judged (multi-project, agent quality) |
 | C | phase-c.test.ts | 11 scenarios | Mixed (deterministic + LLM-judged) |
 ## Adding a New Spec
 1. Create `tests/e2e/specs/my-feature.test.ts`
 2. Import from `@wdio/globals` for `browser` and `expect`
 3. Use `data-testid` selectors (preferred) or CSS classes
 4. Add to `wdio.conf.js` specs array
 5. For LLM assertions: `import { assertWithJudge } from '../infra/llm-judge'`
 6. Run `./scripts/check-test-flags.sh` if you added new AGOR_TEST references
 ## CI Workflow
 See `.github/workflows/e2e.yml` — 3 jobs:
 1. **unit-tests**: vitest frontend
 2. **cargo-tests**: Rust backend
 3. **e2e-tests**: WebDriverIO (xvfb-run, Phase A+B+C, LLM tests gated on secret)
--- a/tests/e2e/infra/fixtures.ts
+++ b/tests/e2e/infra/fixtures.ts
--- a/tests/e2e/infra/llm-judge.ts
+++ b/tests/e2e/infra/llm-judge.ts
--- a/tests/e2e/infra/results-db.ts
+++ b/tests/e2e/infra/results-db.ts
--- a/tests/e2e/infra/test-mode-constants.ts
+++ b/tests/e2e/infra/test-mode-constants.ts
@ -0,0 +1,23 @@
 /*
  * Test-mode environment variable names, exported as typed constants.
  *
  * Import these instead of repeating the raw strings, so each name is
  * spelled in exactly one place.
  *
  * Readers of these variables:
  *   Rust
  *     agor-core/src/config.rs          AppConfig::from_env
  *     src-tauri/src/commands/misc.rs   is_test_mode
  *     src-tauri/src/lib.rs             setup: skip CLI install, forward to sidecar
  *     src-tauri/src/watcher.rs         disable file watcher
  *     src-tauri/src/fs_watcher.rs      disable fs watcher
  *     src-tauri/src/telemetry.rs       disable OTLP
  *   Svelte
  *     src/App.svelte                   disable wake scheduler
  */

 /** Master switch: test isolation is enabled when this variable is set to '1'. */
 export const AGOR_TEST = 'AGOR_TEST';

 /** Replaces the data directory (sessions.db, btmsg.db, search.db). */
 export const AGOR_TEST_DATA_DIR = 'AGOR_TEST_DATA_DIR';

 /** Replaces the config directory (groups.json, plugins/). */
 export const AGOR_TEST_CONFIG_DIR = 'AGOR_TEST_CONFIG_DIR';

 /** Every test-mode variable name, in declaration order, for iteration. */
 export const TEST_ENV_VARS = [
   AGOR_TEST,
   AGOR_TEST_DATA_DIR,
   AGOR_TEST_CONFIG_DIR,
 ] as const;
--- a/tests/e2e/specs/phase-b.test.ts
+++ b/tests/e2e/specs/phase-b.test.ts
@ -1,5 +1,5 @@
 import { browser, expect } from '@wdio/globals';
-import { isJudgeAvailable, assertWithJudge } from '../llm-judge';
+import { isJudgeAvailable, assertWithJudge } from '../infra/llm-judge';
 // Phase B: Multi-project scenarios + LLM-judged assertions.
 // Extends Phase A with tests that exercise multiple project boxes simultaneously
--- a/tests/e2e/specs/phase-c.test.ts
+++ b/tests/e2e/specs/phase-c.test.ts
@ -1,5 +1,5 @@
 import { browser, expect } from '@wdio/globals';
-import { isJudgeAvailable, assertWithJudge } from '../llm-judge';
+import { isJudgeAvailable, assertWithJudge } from '../infra/llm-judge';
 // Phase C: Hardening feature tests.
 // Tests the v3 production-readiness features added in the hardening sprint:
--- a/tests/e2e/wdio.conf.js
+++ b/tests/e2e/wdio.conf.js
@ -1,68 +1,30 @@
-import { spawn, execSync } from 'node:child_process';
+import { spawn } from 'node:child_process';
 import { createConnection } from 'node:net';
-import { resolve, dirname, join } from 'node:path';
+import { resolve, dirname } from 'node:path';
 import { fileURLToPath } from 'node:url';
-import { mkdirSync, writeFileSync, rmSync } from 'node:fs';
+import { rmSync } from 'node:fs';
-import { tmpdir } from 'node:os';
+import { createTestFixture } from './infra/fixtures.ts';
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const projectRoot = resolve(__dirname, '../..');
-// Debug binary path (built with `cargo tauri build --debug --no-bundle`)
+// Debug binary path (Cargo workspace target at repo root)
 // Cargo workspace target dir is at v2/target/, not v2/src-tauri/target/
 const tauriBinary = resolve(projectRoot, 'target/debug/agent-orchestrator');
 let tauriDriver;
-// ── Test Fixture (created eagerly so env vars are available for capabilities) ──
+// ── Test Fixture ──
-const fixtureRoot = join(tmpdir(), `agor-e2e-${Date.now()}`);
+// IMPORTANT: Must be created at module top-level (synchronously) because the
-const fixtureDataDir = join(fixtureRoot, 'data');
+// capabilities object below references fixtureDataDir/fixtureConfigDir at eval time.
-const fixtureConfigDir = join(fixtureRoot, 'config');
+// tauri:options.env may not reliably set process-level env vars, so we also
-const fixtureProjectDir = join(fixtureRoot, 'test-project');
+// inject into process.env for tauri-driver inheritance.
 const fixture = createTestFixture('agor-e2e');
 mkdirSync(fixtureDataDir, { recursive: true });
 mkdirSync(fixtureConfigDir, { recursive: true });
 mkdirSync(fixtureProjectDir, { recursive: true });
 // Create a minimal git repo for agent testing
 execSync('git init', { cwd: fixtureProjectDir, stdio: 'ignore' });
 execSync('git config user.email "test@agor.dev"', { cwd: fixtureProjectDir, stdio: 'ignore' });
 execSync('git config user.name "Agor Test"', { cwd: fixtureProjectDir, stdio: 'ignore' });
 writeFileSync(join(fixtureProjectDir, 'README.md'), '# Test Project\n\nA simple test project for Agor E2E tests.\n');
 writeFileSync(join(fixtureProjectDir, 'hello.py'), 'def greet(name: str) -> str:\n    return f"Hello, {name}!"\n');
 execSync('git add -A && git commit -m "initial commit"', { cwd: fixtureProjectDir, stdio: 'ignore' });
 // Write groups.json with one group containing the test project
 writeFileSync(
  join(fixtureConfigDir, 'groups.json'),
  JSON.stringify({
    version: 1,
    groups: [{
      id: 'test-group',
      name: 'Test Group',
      projects: [{
        id: 'test-project',
        name: 'Test Project',
        identifier: 'test-project',
        description: 'E2E test project',
        icon: '\uf120',
        cwd: fixtureProjectDir,
        profile: 'default',
        enabled: true,
      }],
      agents: [],
    }],
    activeGroupId: 'test-group',
  }, null, 2),
 );
 // Inject env vars into process.env so tauri-driver inherits them
 // (tauri:options.env may not reliably set process-level env vars)
 process.env.AGOR_TEST = '1';
-process.env.AGOR_TEST_DATA_DIR = fixtureDataDir;
+process.env.AGOR_TEST_DATA_DIR = fixture.dataDir;
-process.env.AGOR_TEST_CONFIG_DIR = fixtureConfigDir;
+process.env.AGOR_TEST_CONFIG_DIR = fixture.configDir;
-console.log(`Test fixture created at ${fixtureRoot}`);
+console.log(`Test fixture created at ${fixture.rootDir}`);
 export const config = {
  // ── Runner ──
@ -78,10 +40,10 @@ export const config = {
  // Single spec file — Tauri launches one app instance per session,
  // and tauri-driver can't re-create sessions between spec files.
  specs: [
-    resolve(__dirname, 'specs/agor.test.ts'),
+    resolve(projectRoot, 'tests/e2e/specs/agor.test.ts'),
-    resolve(__dirname, 'specs/agent-scenarios.test.ts'),
+    resolve(projectRoot, 'tests/e2e/specs/agent-scenarios.test.ts'),
-    resolve(__dirname, 'specs/phase-b.test.ts'),
+    resolve(projectRoot, 'tests/e2e/specs/phase-b.test.ts'),
-    resolve(__dirname, 'specs/phase-c.test.ts'),
+    resolve(projectRoot, 'tests/e2e/specs/phase-c.test.ts'),
  ],
  // ── Capabilities ──
@ -91,11 +53,7 @@ export const config = {
    'tauri:options': {
      application: tauriBinary,
      // Test isolation: fixture-created data/config dirs, disable watchers/telemetry
-      env: {
+      env: fixture.env,
        AGOR_TEST: '1',
        AGOR_TEST_DATA_DIR: fixtureDataDir,
        AGOR_TEST_CONFIG_DIR: fixtureConfigDir,
      },
    },
  }],
@ -199,7 +157,7 @@ export const config = {
    }
    // Clean up test fixture
    try {
-      rmSync(fixtureRoot, { recursive: true, force: true });
+      rmSync(fixture.rootDir, { recursive: true, force: true });
      console.log('Test fixture cleaned up.');
    } catch { /* best-effort cleanup */ }
  },
@ -207,7 +165,7 @@ export const config = {
  // ── TypeScript (auto-compile via tsx) ──
  autoCompileOpts: {
    tsNodeOpts: {
-      project: resolve(__dirname, 'tsconfig.json'),
+      project: resolve(projectRoot, 'tests/e2e/tsconfig.json'),
    },
  },
 };