refactor(e2e): extract infrastructure into tests/e2e/infra/ module
- Move fixtures.ts, llm-judge.ts, results-db.ts to tests/e2e/infra/ - Deduplicate wdio.conf.js: use createTestFixture() instead of inline copy - Replace __dirname paths with projectRoot-anchored paths - Create test-mode-constants.ts (typed env var names, flag registry) - Create scripts/preflight-check.sh (validates tauri-driver, display, Claude CLI) - Create scripts/check-test-flags.sh (CI lint for AGOR_TEST flag drift) - Rewrite tests/e2e/README.md with full documentation - Update spec imports for moved infra files
This commit is contained in:
parent
538a31f85c
commit
e76bc341f2
10 changed files with 235 additions and 191 deletions
142
tests/e2e/infra/fixtures.ts
Normal file
142
tests/e2e/infra/fixtures.ts
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
// Test fixture generator — creates isolated test environments
|
||||
// Used by E2E tests to set up temp data/config dirs with valid groups.json
|
||||
|
||||
import { mkdirSync, writeFileSync, rmSync, existsSync } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import { execSync } from 'node:child_process';
|
||||
import { tmpdir } from 'node:os';
|
||||
|
||||
/** Paths and environment for one isolated E2E test run. */
export interface TestFixture {
  /** Root temp directory for this test run — every other path lives under it */
  rootDir: string;
  /** AGOR_TEST_DATA_DIR — isolated data dir (sessions.db etc. created at runtime) */
  dataDir: string;
  /** AGOR_TEST_CONFIG_DIR — isolated config dir (holds the generated groups.json) */
  configDir: string;
  /** Path to a minimal git repo for agent testing */
  projectDir: string;
  /** Environment variables to pass to the app to activate test isolation */
  env: Record<string, string>;
}
|
||||
|
||||
/**
|
||||
* Create an isolated test fixture with:
|
||||
* - Temp data dir (sessions.db, btmsg.db created at runtime)
|
||||
* - Temp config dir with a minimal groups.json
|
||||
* - A simple git repo with one file for agent testing
|
||||
*/
|
||||
export function createTestFixture(name = 'agor-e2e'): TestFixture {
|
||||
const rootDir = join(tmpdir(), `${name}-${Date.now()}`);
|
||||
const dataDir = join(rootDir, 'data');
|
||||
const configDir = join(rootDir, 'config');
|
||||
const projectDir = join(rootDir, 'test-project');
|
||||
|
||||
// Create directory structure
|
||||
mkdirSync(dataDir, { recursive: true });
|
||||
mkdirSync(configDir, { recursive: true });
|
||||
mkdirSync(projectDir, { recursive: true });
|
||||
|
||||
// Create a minimal git repo for agent testing
|
||||
execSync('git init', { cwd: projectDir, stdio: 'ignore' });
|
||||
execSync('git config user.email "test@agor.dev"', { cwd: projectDir, stdio: 'ignore' });
|
||||
execSync('git config user.name "Agor Test"', { cwd: projectDir, stdio: 'ignore' });
|
||||
writeFileSync(join(projectDir, 'README.md'), '# Test Project\n\nA simple test project for Agor E2E tests.\n');
|
||||
writeFileSync(join(projectDir, 'hello.py'), 'def greet(name: str) -> str:\n return f"Hello, {name}!"\n');
|
||||
execSync('git add -A && git commit -m "initial commit"', { cwd: projectDir, stdio: 'ignore' });
|
||||
|
||||
// Write groups.json with one group containing the test project
|
||||
const groupsJson = {
|
||||
version: 1,
|
||||
groups: [
|
||||
{
|
||||
id: 'test-group',
|
||||
name: 'Test Group',
|
||||
projects: [
|
||||
{
|
||||
id: 'test-project',
|
||||
name: 'Test Project',
|
||||
identifier: 'test-project',
|
||||
description: 'E2E test project',
|
||||
icon: '\uf120',
|
||||
cwd: projectDir,
|
||||
profile: 'default',
|
||||
enabled: true,
|
||||
},
|
||||
],
|
||||
agents: [],
|
||||
},
|
||||
],
|
||||
activeGroupId: 'test-group',
|
||||
};
|
||||
|
||||
writeFileSync(
|
||||
join(configDir, 'groups.json'),
|
||||
JSON.stringify(groupsJson, null, 2),
|
||||
);
|
||||
|
||||
const env: Record<string, string> = {
|
||||
AGOR_TEST: '1',
|
||||
AGOR_TEST_DATA_DIR: dataDir,
|
||||
AGOR_TEST_CONFIG_DIR: configDir,
|
||||
};
|
||||
|
||||
return { rootDir, dataDir, configDir, projectDir, env };
|
||||
}
|
||||
|
||||
/**
|
||||
* Clean up a test fixture's temporary directories.
|
||||
*/
|
||||
export function destroyTestFixture(fixture: TestFixture): void {
|
||||
if (existsSync(fixture.rootDir)) {
|
||||
rmSync(fixture.rootDir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a groups.json with multiple projects for multi-project testing.
|
||||
*/
|
||||
export function createMultiProjectFixture(projectCount = 3): TestFixture {
|
||||
const fixture = createTestFixture('agor-multi');
|
||||
|
||||
const projects = [];
|
||||
for (let i = 0; i < projectCount; i++) {
|
||||
const projDir = join(fixture.rootDir, `project-${i}`);
|
||||
mkdirSync(projDir, { recursive: true });
|
||||
execSync('git init', { cwd: projDir, stdio: 'ignore' });
|
||||
execSync('git config user.email "test@agor.dev"', { cwd: projDir, stdio: 'ignore' });
|
||||
execSync('git config user.name "Agor Test"', { cwd: projDir, stdio: 'ignore' });
|
||||
writeFileSync(join(projDir, 'README.md'), `# Project ${i}\n`);
|
||||
execSync('git add -A && git commit -m "init"', { cwd: projDir, stdio: 'ignore' });
|
||||
|
||||
projects.push({
|
||||
id: `project-${i}`,
|
||||
name: `Project ${i}`,
|
||||
identifier: `project-${i}`,
|
||||
description: `Test project ${i}`,
|
||||
icon: '\uf120',
|
||||
cwd: projDir,
|
||||
profile: 'default',
|
||||
enabled: true,
|
||||
});
|
||||
}
|
||||
|
||||
const groupsJson = {
|
||||
version: 1,
|
||||
groups: [
|
||||
{
|
||||
id: 'multi-group',
|
||||
name: 'Multi Project Group',
|
||||
projects,
|
||||
agents: [],
|
||||
},
|
||||
],
|
||||
activeGroupId: 'multi-group',
|
||||
};
|
||||
|
||||
writeFileSync(
|
||||
join(fixture.configDir, 'groups.json'),
|
||||
JSON.stringify(groupsJson, null, 2),
|
||||
);
|
||||
|
||||
return fixture;
|
||||
}
|
||||
231
tests/e2e/infra/llm-judge.ts
Normal file
231
tests/e2e/infra/llm-judge.ts
Normal file
|
|
@ -0,0 +1,231 @@
|
|||
// LLM Judge — evaluates test outcomes via Claude.
|
||||
//
|
||||
// Two backends, configurable via LLM_JUDGE_BACKEND env var:
|
||||
// "cli" — Claude CLI (default, no API key needed)
|
||||
// "api" — Anthropic REST API (requires ANTHROPIC_API_KEY)
|
||||
//
|
||||
// CLI backend: spawns `claude` with --output-format text, parses JSON verdict.
|
||||
// API backend: raw fetch to messages API, same JSON verdict parsing.
|
||||
//
|
||||
// Skips gracefully when neither backend is available.
|
||||
|
||||
import { execFileSync, execSync } from 'node:child_process';
|
||||
import { existsSync } from 'node:fs';
|
||||
|
||||
// Judge model + API endpoint. Haiku keeps per-verdict cost/latency low.
const MODEL = 'claude-haiku-4-5-20251001';
const API_URL = 'https://api.anthropic.com/v1/messages';
// Cap on verdict length — the verdict JSON is tiny.
const MAX_TOKENS = 512;

// CLI search paths (in order), checked before falling back to `which`.
// NOTE(review): `process.env.HOME` may be undefined (e.g. on Windows),
// yielding "undefined/..." paths — assumed tests only run on Unix; confirm.
const CLI_PATHS = [
  `${process.env.HOME}/.local/bin/claude`,
  `${process.env.HOME}/.claude/local/claude`,
  '/usr/local/bin/claude',
  '/usr/bin/claude',
];
|
||||
|
||||
/** Which judge implementation to use: Claude CLI subprocess or REST API. */
export type JudgeBackend = 'cli' | 'api';

/** Structured verdict parsed from the judge model's JSON reply. */
export interface JudgeVerdict {
  /** Did the actual output satisfy the criteria? */
  pass: boolean;
  /** Brief model-provided explanation for the verdict. */
  reasoning: string;
  /** Model's self-reported confidence. */
  confidence: number; // 0-1
}
|
||||
|
||||
/**
|
||||
* Find the Claude CLI binary path, or null if not installed.
|
||||
*/
|
||||
function findClaudeCli(): string | null {
|
||||
for (const p of CLI_PATHS) {
|
||||
if (existsSync(p)) return p;
|
||||
}
|
||||
// Fallback: check PATH
|
||||
try {
|
||||
const which = execSync('which claude 2>/dev/null', { encoding: 'utf-8' }).trim();
|
||||
if (which) return which;
|
||||
} catch {
|
||||
// not found
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine which backend to use.
|
||||
* Env var LLM_JUDGE_BACKEND overrides auto-detection.
|
||||
* Auto: CLI if available, then API if key set, else null.
|
||||
*/
|
||||
function resolveBackend(): JudgeBackend | null {
|
||||
const explicit = process.env.LLM_JUDGE_BACKEND?.toLowerCase();
|
||||
if (explicit === 'cli') return findClaudeCli() ? 'cli' : null;
|
||||
if (explicit === 'api') return process.env.ANTHROPIC_API_KEY ? 'api' : null;
|
||||
|
||||
// Auto-detect: CLI first, API fallback
|
||||
if (findClaudeCli()) return 'cli';
|
||||
if (process.env.ANTHROPIC_API_KEY) return 'api';
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the LLM judge is available (CLI installed or API key set).
|
||||
*/
|
||||
export function isJudgeAvailable(): boolean {
|
||||
return resolveBackend() !== null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the prompt for the judge.
|
||||
*/
|
||||
function buildPrompt(criteria: string, actual: string, context?: string): { system: string; user: string } {
|
||||
const system = `You are a test assertion judge for a terminal emulator application called BTerminal.
|
||||
Your job is to evaluate whether actual output from the application meets the given criteria.
|
||||
Respond with EXACTLY this JSON format, nothing else:
|
||||
{"pass": true/false, "reasoning": "brief explanation", "confidence": 0.0-1.0}`;
|
||||
|
||||
const user = [
|
||||
'## Criteria',
|
||||
criteria,
|
||||
'',
|
||||
'## Actual Output',
|
||||
actual,
|
||||
...(context ? ['', '## Additional Context', context] : []),
|
||||
'',
|
||||
'Does the actual output satisfy the criteria? Respond with JSON only.',
|
||||
].join('\n');
|
||||
|
||||
return { system, user };
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract and validate a JudgeVerdict from raw text output.
|
||||
*/
|
||||
function parseVerdict(text: string): JudgeVerdict {
|
||||
const jsonMatch = text.match(/\{[\s\S]*\}/);
|
||||
if (!jsonMatch) {
|
||||
throw new Error(`LLM judge returned non-JSON: ${text}`);
|
||||
}
|
||||
|
||||
const verdict = JSON.parse(jsonMatch[0]) as JudgeVerdict;
|
||||
|
||||
if (typeof verdict.pass !== 'boolean') {
|
||||
throw new Error(`LLM judge returned invalid verdict: ${text}`);
|
||||
}
|
||||
verdict.confidence = Number(verdict.confidence) || 0;
|
||||
verdict.reasoning = String(verdict.reasoning || '');
|
||||
|
||||
return verdict;
|
||||
}
|
||||
|
||||
/**
|
||||
* Judge via Claude CLI (spawns subprocess).
|
||||
* Unsets CLAUDECODE to avoid nested session errors.
|
||||
*/
|
||||
async function judgeCli(
|
||||
criteria: string,
|
||||
actual: string,
|
||||
context?: string,
|
||||
): Promise<JudgeVerdict> {
|
||||
const cliPath = findClaudeCli();
|
||||
if (!cliPath) throw new Error('Claude CLI not found');
|
||||
|
||||
const { system, user } = buildPrompt(criteria, actual, context);
|
||||
|
||||
const output = execFileSync(cliPath, [
|
||||
'-p', user,
|
||||
'--model', MODEL,
|
||||
'--output-format', 'text',
|
||||
'--system-prompt', system,
|
||||
'--setting-sources', 'user', // skip project CLAUDE.md
|
||||
], {
|
||||
encoding: 'utf-8',
|
||||
timeout: 60_000,
|
||||
cwd: '/tmp', // avoid loading project CLAUDE.md
|
||||
env: { ...process.env, CLAUDECODE: '' },
|
||||
maxBuffer: 1024 * 1024,
|
||||
});
|
||||
|
||||
return parseVerdict(output);
|
||||
}
|
||||
|
||||
/**
|
||||
* Judge via Anthropic REST API (raw fetch).
|
||||
*/
|
||||
async function judgeApi(
|
||||
criteria: string,
|
||||
actual: string,
|
||||
context?: string,
|
||||
): Promise<JudgeVerdict> {
|
||||
const apiKey = process.env.ANTHROPIC_API_KEY;
|
||||
if (!apiKey) throw new Error('ANTHROPIC_API_KEY not set');
|
||||
|
||||
const { system, user } = buildPrompt(criteria, actual, context);
|
||||
|
||||
const response = await fetch(API_URL, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'x-api-key': apiKey,
|
||||
'anthropic-version': '2023-06-01',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: MODEL,
|
||||
max_tokens: MAX_TOKENS,
|
||||
system,
|
||||
messages: [{ role: 'user', content: user }],
|
||||
}),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const body = await response.text();
|
||||
throw new Error(`Anthropic API error ${response.status}: ${body}`);
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
const text = data.content?.[0]?.text ?? '';
|
||||
|
||||
return parseVerdict(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* Ask Claude to evaluate whether `actual` output satisfies `criteria`.
|
||||
*
|
||||
* Uses CLI backend by default, falls back to API. Override with
|
||||
* LLM_JUDGE_BACKEND env var ("cli" or "api").
|
||||
*
|
||||
* Returns a structured verdict with pass/fail, reasoning, and confidence.
|
||||
* Throws if no backend available or call fails.
|
||||
*/
|
||||
export async function judge(
|
||||
criteria: string,
|
||||
actual: string,
|
||||
context?: string,
|
||||
): Promise<JudgeVerdict> {
|
||||
const backend = resolveBackend();
|
||||
if (!backend) {
|
||||
throw new Error('LLM judge unavailable — no Claude CLI found and ANTHROPIC_API_KEY not set');
|
||||
}
|
||||
|
||||
if (backend === 'cli') {
|
||||
return judgeCli(criteria, actual, context);
|
||||
}
|
||||
return judgeApi(criteria, actual, context);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience: judge with a minimum confidence threshold.
|
||||
* Returns pass=true only if verdict.pass=true AND confidence >= threshold.
|
||||
*/
|
||||
export async function assertWithJudge(
|
||||
criteria: string,
|
||||
actual: string,
|
||||
options: { context?: string; minConfidence?: number } = {},
|
||||
): Promise<JudgeVerdict> {
|
||||
const { context, minConfidence = 0.7 } = options;
|
||||
const verdict = await judge(criteria, actual, context);
|
||||
|
||||
if (verdict.pass && verdict.confidence < minConfidence) {
|
||||
verdict.pass = false;
|
||||
verdict.reasoning += ` (confidence ${verdict.confidence} below threshold ${minConfidence})`;
|
||||
}
|
||||
|
||||
return verdict;
|
||||
}
|
||||
113
tests/e2e/infra/results-db.ts
Normal file
113
tests/e2e/infra/results-db.ts
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
// Test results store — persists test run outcomes as JSON for analysis
|
||||
// No native deps needed — reads/writes a JSON file
|
||||
|
||||
import { resolve, dirname } from 'node:path';
|
||||
import { mkdirSync, readFileSync, writeFileSync, existsSync } from 'node:fs';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
|
||||
// ESM has no built-in __dirname — derive it from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
// From tests/e2e/infra/, '../..' resolves to tests/ — so the default store
// is tests/test-results/results.json.
// NOTE(review): this file was just moved into infra/; confirm '../..' still
// points at the intended test-results directory.
const DEFAULT_PATH = resolve(__dirname, '../../test-results/results.json');
|
||||
|
||||
/** Summary row for one E2E run; counts are derived by finishRun(). */
export interface TestRunRow {
  /** Unique identifier for the run. */
  run_id: string;
  /** ISO-8601 start timestamp. */
  started_at: string;
  /** ISO-8601 end timestamp; null while the run is still in flight. */
  finished_at: string | null;
  status: 'running' | 'passed' | 'failed' | 'error';
  /** Total recorded steps, computed when the run finishes. */
  total_tests: number;
  passed_tests: number;
  /** Counts both 'failed' and 'error' steps. */
  failed_tests: number;
  duration_ms: number | null;
  /** Git branch/sha of the checkout under test, when known. */
  git_branch: string | null;
  git_sha: string | null;
}

/** One row per scenario step within a run. */
export interface TestStepRow {
  /** Run this step belongs to (foreign key into TestRunRow.run_id). */
  run_id: string;
  scenario_name: string;
  step_name: string;
  status: 'passed' | 'failed' | 'skipped' | 'error';
  duration_ms: number | null;
  error_message: string | null;
  /** Path to a screenshot captured for this step, if any. */
  screenshot_path: string | null;
  /** Cost of any agent/LLM calls made during the step, in USD. */
  agent_cost_usd: number | null;
  /** ISO-8601 timestamp stamped by recordStep(). */
  created_at: string;
}

/** On-disk JSON shape: flat arrays of runs and steps. */
interface ResultsStore {
  runs: TestRunRow[];
  steps: TestStepRow[];
}
|
||||
|
||||
export class ResultsDb {
|
||||
private filePath: string;
|
||||
private store: ResultsStore;
|
||||
|
||||
constructor(filePath = DEFAULT_PATH) {
|
||||
this.filePath = filePath;
|
||||
mkdirSync(dirname(filePath), { recursive: true });
|
||||
this.store = this.load();
|
||||
}
|
||||
|
||||
private load(): ResultsStore {
|
||||
if (existsSync(this.filePath)) {
|
||||
try {
|
||||
return JSON.parse(readFileSync(this.filePath, 'utf-8'));
|
||||
} catch {
|
||||
return { runs: [], steps: [] };
|
||||
}
|
||||
}
|
||||
return { runs: [], steps: [] };
|
||||
}
|
||||
|
||||
private save(): void {
|
||||
writeFileSync(this.filePath, JSON.stringify(this.store, null, 2));
|
||||
}
|
||||
|
||||
startRun(runId: string, gitBranch?: string, gitSha?: string): void {
|
||||
this.store.runs.push({
|
||||
run_id: runId,
|
||||
started_at: new Date().toISOString(),
|
||||
finished_at: null,
|
||||
status: 'running',
|
||||
total_tests: 0,
|
||||
passed_tests: 0,
|
||||
failed_tests: 0,
|
||||
duration_ms: null,
|
||||
git_branch: gitBranch ?? null,
|
||||
git_sha: gitSha ?? null,
|
||||
});
|
||||
this.save();
|
||||
}
|
||||
|
||||
finishRun(runId: string, status: 'passed' | 'failed' | 'error', durationMs: number): void {
|
||||
const run = this.store.runs.find(r => r.run_id === runId);
|
||||
if (!run) return;
|
||||
|
||||
const steps = this.store.steps.filter(s => s.run_id === runId);
|
||||
run.finished_at = new Date().toISOString();
|
||||
run.status = status;
|
||||
run.duration_ms = durationMs;
|
||||
run.total_tests = steps.length;
|
||||
run.passed_tests = steps.filter(s => s.status === 'passed').length;
|
||||
run.failed_tests = steps.filter(s => s.status === 'failed' || s.status === 'error').length;
|
||||
this.save();
|
||||
}
|
||||
|
||||
recordStep(step: Omit<TestStepRow, 'created_at'>): void {
|
||||
this.store.steps.push({
|
||||
...step,
|
||||
created_at: new Date().toISOString(),
|
||||
});
|
||||
this.save();
|
||||
}
|
||||
|
||||
getRecentRuns(limit = 20): TestRunRow[] {
|
||||
return this.store.runs
|
||||
.sort((a, b) => b.started_at.localeCompare(a.started_at))
|
||||
.slice(0, limit);
|
||||
}
|
||||
|
||||
getStepsForRun(runId: string): TestStepRow[] {
|
||||
return this.store.steps.filter(s => s.run_id === runId);
|
||||
}
|
||||
}
|
||||
23
tests/e2e/infra/test-mode-constants.ts
Normal file
23
tests/e2e/infra/test-mode-constants.ts
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
// Typed constants for test-mode environment variables.
|
||||
// Single source of truth for env var names — prevents string literal duplication.
|
||||
//
|
||||
// These env vars are read by:
|
||||
// Rust: agor-core/src/config.rs (AppConfig::from_env)
|
||||
// src-tauri/src/commands/misc.rs (is_test_mode)
|
||||
// src-tauri/src/lib.rs (setup: skip CLI install, forward to sidecar)
|
||||
// src-tauri/src/watcher.rs (disable file watcher)
|
||||
// src-tauri/src/fs_watcher.rs (disable fs watcher)
|
||||
// src-tauri/src/telemetry.rs (disable OTLP)
|
||||
// Svelte: src/App.svelte (disable wake scheduler)
|
||||
|
||||
/** Main test mode flag — set to '1' to enable test isolation. */
export const AGOR_TEST = 'AGOR_TEST';

/** Override data directory (sessions.db, btmsg.db, search.db). */
export const AGOR_TEST_DATA_DIR = 'AGOR_TEST_DATA_DIR';

/** Override config directory (groups.json, plugins/). */
export const AGOR_TEST_CONFIG_DIR = 'AGOR_TEST_CONFIG_DIR';

/** Every test-mode env var name, for iteration (e.g. flag-drift lint). */
export const TEST_ENV_VARS = [AGOR_TEST, AGOR_TEST_DATA_DIR, AGOR_TEST_CONFIG_DIR] as const;
|
||||
Loading…
Add table
Add a link
Reference in a new issue