feat(e2e): add Phase B scenarios with LLM-judged assertions and multi-project tests

Adds 6 new E2E scenarios in phase-b.test.ts covering multi-project grid
rendering, independent tab switching, status bar fleet state, and
LLM-judged agent response quality evaluation via Claude API.
Includes llm-judge.ts helper (raw Anthropic API fetch, haiku-4-5,
structured verdicts with confidence thresholds).
This commit is contained in:
Hibryda 2026-03-12 03:07:38 +01:00
parent c4c673a4b0
commit 5e4357e4ac
3 changed files with 469 additions and 0 deletions

113
v2/tests/e2e/llm-judge.ts Normal file
View file

@ -0,0 +1,113 @@
// LLM Judge — evaluates test outcomes via Claude API
// Uses raw fetch (no SDK dep). Requires ANTHROPIC_API_KEY env var.
// Skips gracefully when API key is absent.
/** Endpoint that judge requests are POSTed to. */
const API_URL = 'https://api.anthropic.com/v1/messages';
/** Model identifier sent as `model` in every judge request. */
const MODEL = 'claude-haiku-4-5-20251001'; // Fast + cheap for test judging
/** Value sent as `max_tokens` in every judge request. */
const MAX_TOKENS = 512;
/** Structured result of a single LLM judge evaluation. */
export interface JudgeVerdict {
/** Whether the judged output was deemed to satisfy the criteria. */
pass: boolean;
/** Explanation accompanying the verdict. */
reasoning: string;
/** Confidence reported alongside the verdict. */
confidence: number; // 0-1
}
/**
 * Report whether the LLM judge can be used in this environment.
 *
 * @returns `true` when the `ANTHROPIC_API_KEY` environment variable holds a
 *   non-empty value, `false` otherwise.
 */
export function isJudgeAvailable(): boolean {
  const apiKey = process.env.ANTHROPIC_API_KEY;
  return Boolean(apiKey);
}
/**
 * Pull the model's text out of a Messages API response body.
 * Returns the first content block that carries a string `text`, or '' if none does.
 */
function extractJudgeText(data: unknown): string {
  if (typeof data !== 'object' || data === null) return '';
  const content = (data as { content?: unknown }).content;
  if (!Array.isArray(content)) return '';
  for (const block of content as unknown[]) {
    if (typeof block !== 'object' || block === null) continue;
    const text = (block as { text?: unknown }).text;
    if (typeof text === 'string') return text;
  }
  return '';
}

/**
 * Turn the model's raw text into a validated, normalized verdict.
 * Throws an Error that quotes the raw text when it cannot be interpreted.
 */
function parseJudgeVerdict(text: string): JudgeVerdict {
  // The model may wrap the JSON in markdown fences or prose; grab the outermost braces.
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) {
    throw new Error(`LLM judge returned non-JSON: ${text}`);
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(jsonMatch[0]);
  } catch {
    // Surface the offending text instead of a bare SyntaxError.
    throw new Error(`LLM judge returned non-JSON: ${text}`);
  }

  const raw = parsed as { pass?: unknown; reasoning?: unknown; confidence?: unknown };
  if (typeof raw.pass !== 'boolean') {
    throw new Error(`LLM judge returned invalid verdict: ${text}`);
  }

  // Missing / non-numeric confidence counts as 0; anything else is clamped into [0, 1].
  const confidence = Number(raw.confidence);
  return {
    pass: raw.pass,
    reasoning: String(raw.reasoning || ''),
    confidence: Number.isNaN(confidence) ? 0 : Math.min(1, Math.max(0, confidence)),
  };
}

/**
 * Ask Claude to evaluate whether `actual` output satisfies `criteria`.
 *
 * @param criteria - Description of what the output must satisfy.
 * @param actual - The output under test.
 * @param context - Optional extra background appended to the prompt.
 * @returns A verdict containing only `pass`, `reasoning` and `confidence`;
 *   `confidence` is always a number within 0-1.
 * @throws Error when `ANTHROPIC_API_KEY` is not set, when the API responds with
 *   a non-2xx status, or when the reply contains no parseable JSON verdict with
 *   a boolean `pass`. Errors raised by `fetch` itself propagate unchanged.
 *   Callers should catch and handle.
 */
export async function judge(
  criteria: string,
  actual: string,
  context?: string,
): Promise<JudgeVerdict> {
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    throw new Error('ANTHROPIC_API_KEY not set — LLM judge unavailable');
  }

  const systemPrompt = [
    'You are a test assertion judge for a terminal emulator application called BTerminal.',
    'Your job is to evaluate whether actual output from the application meets the given criteria.',
    'Respond with EXACTLY this JSON format, nothing else:',
    '{"pass": true/false, "reasoning": "brief explanation", "confidence": 0.0-1.0}',
  ].join('\n');

  const userPrompt = [
    '## Criteria',
    criteria,
    '',
    '## Actual Output',
    actual,
    ...(context ? ['', '## Additional Context', context] : []),
    '',
    'Does the actual output satisfy the criteria? Respond with JSON only.',
  ].join('\n');

  const response = await fetch(API_URL, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': apiKey,
      'anthropic-version': '2023-06-01',
    },
    body: JSON.stringify({
      model: MODEL,
      max_tokens: MAX_TOKENS,
      system: systemPrompt,
      messages: [{ role: 'user', content: userPrompt }],
    }),
  });

  if (!response.ok) {
    const body = await response.text();
    throw new Error(`Anthropic API error ${response.status}: ${body}`);
  }

  const data: unknown = await response.json();
  return parseJudgeVerdict(extractJudgeText(data));
}
/**
 * Convenience wrapper around `judge` that enforces a minimum confidence.
 *
 * A passing verdict whose confidence is below the threshold is downgraded to
 * `pass: false`, and a note naming both numbers is appended to its reasoning.
 * Failing verdicts are returned untouched.
 *
 * @param criteria - Forwarded to `judge`.
 * @param actual - Forwarded to `judge`.
 * @param options - `context` is forwarded to `judge`; `minConfidence`
 *   (default 0.7) is the threshold a passing verdict must reach.
 * @returns The (possibly downgraded) verdict.
 */
export async function assertWithJudge(
  criteria: string,
  actual: string,
  options: { context?: string; minConfidence?: number } = {},
): Promise<JudgeVerdict> {
  const threshold = options.minConfidence === undefined ? 0.7 : options.minConfidence;
  const result = await judge(criteria, actual, options.context);

  const belowThreshold = result.confidence < threshold;
  if (!result.pass || !belowThreshold) {
    return result;
  }

  result.pass = false;
  result.reasoning += ` (confidence ${result.confidence} below threshold ${threshold})`;
  return result;
}

View file

@ -0,0 +1,355 @@
import { browser, expect } from '@wdio/globals';
import { isJudgeAvailable, assertWithJudge } from '../llm-judge';
// Phase B: Multi-project scenarios + LLM-judged assertions.
// Extends Phase A with tests that exercise multiple project boxes simultaneously
// and use Claude API to evaluate agent response quality.
//
// Prerequisites:
// - Built debug binary (or SKIP_BUILD=1)
// - groups.json with 2+ projects (use BTERMINAL_TEST_CONFIG_DIR or default)
// - ANTHROPIC_API_KEY env var for LLM-judged tests (skipped if absent)
// ─── Helpers ──────────────────────────────────────────────────────────
/**
 * Collect the `data-project-id` of every rendered project box.
 * Boxes whose attribute is missing or empty are left out.
 */
async function getProjectIds(): Promise<string[]> {
  return browser.execute(() => {
    const collected: string[] = [];
    for (const box of Array.from(document.querySelectorAll('[data-testid="project-box"]'))) {
      const projectId = box.getAttribute('data-project-id');
      if (projectId) collected.push(projectId);
    }
    return collected;
  });
}
/**
 * Focus a project box by clicking its `.project-header`, then pause 300 ms.
 * Nothing is clicked when the box or its header cannot be found.
 */
async function focusProject(projectId: string): Promise<void> {
  await browser.execute((id) => {
    const headerEl = document
      .querySelector(`[data-project-id="${id}"]`)
      ?.querySelector('.project-header');
    if (!headerEl) return;
    (headerEl as HTMLElement).click();
  }, projectId);
  await browser.pause(300);
}
/**
 * Read the `data-agent-status` attribute of the agent pane inside a project box.
 * Yields 'not-found' when the box, the pane, or the attribute is absent.
 */
async function getAgentStatus(projectId: string): Promise<string> {
  return browser.execute((id) => {
    const paneEl = document
      .querySelector(`[data-project-id="${id}"]`)
      ?.querySelector('[data-testid="agent-pane"]');
    const statusAttr = paneEl ? paneEl.getAttribute('data-agent-status') : null;
    return statusAttr === null ? 'not-found' : statusAttr;
  }, projectId);
}
/**
 * Type `text` into a project's agent prompt and click its submit button.
 *
 * Steps: focus the project, set the textarea value and fire a bubbling
 * `input` event, pause 200 ms, then click the submit button. Each DOM step is
 * a no-op when its target element is missing.
 */
async function sendPromptInProject(projectId: string, text: string): Promise<void> {
  await focusProject(projectId);

  await browser.execute((id, prompt) => {
    const projectBox = document.querySelector(`[data-project-id="${id}"]`);
    const promptInput = projectBox?.querySelector('[data-testid="agent-prompt"]') as HTMLTextAreaElement | null;
    if (!promptInput) return;
    promptInput.value = prompt;
    promptInput.dispatchEvent(new Event('input', { bubbles: true }));
  }, projectId, text);

  await browser.pause(200);

  await browser.execute((id) => {
    const projectBox = document.querySelector(`[data-project-id="${id}"]`);
    const submitButton = projectBox?.querySelector('[data-testid="agent-submit"]') as HTMLElement | null;
    if (!submitButton) return;
    submitButton.click();
  }, projectId);
}
/**
 * Poll until the agent of `projectId` reports `status`.
 * Relies on `browser.waitUntil` with the given `timeout` (default 60 000 ms)
 * and a message naming the project, status and timeout.
 */
async function waitForProjectAgentStatus(
  projectId: string,
  status: string,
  timeout = 60_000,
): Promise<void> {
  const timeoutMsg = `Agent in project ${projectId} did not reach "${status}" within ${timeout}ms`;
  const hasReachedStatus = async (): Promise<boolean> => {
    const current = await getAgentStatus(projectId);
    return current === status;
  };
  await browser.waitUntil(hasReachedStatus, { timeout, timeoutMsg });
}
/**
 * Return the text content of the agent message area inside a project box,
 * or '' when the box or the message area is missing.
 */
async function getAgentMessages(projectId: string): Promise<string> {
  return browser.execute((id) => {
    const messageArea = document
      .querySelector(`[data-project-id="${id}"]`)
      ?.querySelector('[data-testid="agent-messages"]');
    return messageArea?.textContent ?? '';
  }, projectId);
}
/**
 * Click the tab at `tabIndex` inside a project box, then pause 300 ms.
 * Tab index: 0=Model, 1=Docs, 2=Context, etc.
 * Nothing is clicked when the box is missing or the index has no tab.
 */
async function switchProjectTab(projectId: string, tabIndex: number): Promise<void> {
  await browser.execute((id, idx) => {
    const projectBox = document.querySelector(`[data-project-id="${id}"]`);
    if (!projectBox) return;
    const targetTab = projectBox.querySelectorAll('[data-testid="project-tabs"] .ptab')[idx];
    if (targetTab) (targetTab as HTMLElement).click();
  }, projectId, tabIndex);
  await browser.pause(300);
}
// ─── Scenario B1: Multi-project grid renders correctly ────────────────
describe('Scenario B1 — Multi-Project Grid', () => {
  it('should render multiple project boxes', async () => {
    const projectIds = await getProjectIds();
    // A minimal fixture may define a single project, so one box is enough.
    expect(projectIds.length).toBeGreaterThanOrEqual(1);
    // No project ID may be rendered twice.
    expect(new Set(projectIds).size).toBe(projectIds.length);
  });

  it('should show project headers with CWD paths', async () => {
    const cwdTexts = await browser.execute(() =>
      Array.from(
        document.querySelectorAll('.project-header .cwd'),
        (el) => el.textContent?.trim() ?? '',
      ),
    );
    // Every CWD label that is rendered must be non-empty.
    for (const cwdText of cwdTexts) {
      expect(cwdText.length).toBeGreaterThan(0);
    }
  });

  it('should have independent agent panes per project', async () => {
    const knownStatuses = ['idle', 'running', 'stalled'];
    for (const projectId of await getProjectIds()) {
      const agentStatus = await getAgentStatus(projectId);
      expect(knownStatuses).toContain(agentStatus);
    }
  });

  it('should focus project on click and show active styling', async () => {
    const projectIds = await getProjectIds();
    if (projectIds.length === 0) return;

    const target = projectIds[0];
    await focusProject(target);

    // A focused box is expected to carry the `active` class.
    const hasActiveClass = await browser.execute((id) => {
      const box = document.querySelector(`[data-project-id="${id}"]`);
      return box ? box.classList.contains('active') : false;
    }, target);
    expect(hasActiveClass).toBe(true);
  });
});
// ─── Scenario B2: Independent tab switching across projects ───────────
describe('Scenario B2 — Independent Tab Switching', () => {
  /** Read the trimmed label of the active tab inside one project box ('' if none). */
  const getActiveTabLabel = (projectId: string): Promise<string> =>
    browser.execute((id) => {
      const box = document.querySelector(`[data-project-id="${id}"]`);
      const active = box?.querySelector('[data-testid="project-tabs"] .ptab.active');
      return active?.textContent?.trim() ?? '';
    }, projectId);

  // Uses `function` (not an arrow) so `this.skip()` is available, matching B4/B5.
  it('should allow different tabs active in different projects', async function () {
    const ids = await getProjectIds();
    if (ids.length < 2) {
      console.log('Skipping B2 — need 2+ projects');
      // Report the test as skipped instead of silently passing.
      this.skip();
      return;
    }

    try {
      // Switch first project to Files tab (index 3)
      await switchProjectTab(ids[0], 3);
      // Keep second project on Model tab (index 0)
      await switchProjectTab(ids[1], 0);

      const firstActiveTab = await getActiveTabLabel(ids[0]);
      const secondActiveTab = await getActiveTabLabel(ids[1]);

      // They should be different tabs
      expect(firstActiveTab).not.toBe(secondActiveTab);
    } finally {
      // Restore first project to Model tab even when the assertion fails,
      // so later scenarios start from the default tab.
      await switchProjectTab(ids[0], 0);
    }
  });
});
// ─── Scenario B3: Status bar reflects fleet state ────────────────────
describe('Scenario B3 — Status Bar Fleet State', () => {
  it('should show agent count in status bar', async () => {
    const statusBarText = await browser.execute(
      () => document.querySelector('[data-testid="status-bar"]')?.textContent ?? '',
    );
    // The status bar must render some text (at least the idle-agent count).
    expect(statusBarText.length).toBeGreaterThan(0);
  });

  it('should show $0.00 burn rate when all agents idle', async () => {
    const costText = await browser.execute(() => {
      const statusBar = document.querySelector('[data-testid="status-bar"]');
      const costEl = statusBar ? statusBar.querySelector('.burn-rate, .cost') : null;
      const costLabel = costEl ? costEl.textContent : null;
      if (costLabel != null) return costLabel;
      // No dedicated cost element: fall back to the whole status bar text.
      return statusBar?.textContent ?? '';
    });
    // With no agents running the text should show $0 or 0.00.
    expect(costText).toMatch(/\$0|0\.00/);
  });
});
// ─── Scenario B4: LLM-judged agent response (requires API key) ──────
describe('Scenario B4 — LLM-Judged Agent Response', () => {
  const SKIP_MSG = 'Skipping — ANTHROPIC_API_KEY not set';

  it('should send prompt and get meaningful response', async function () {
    if (!isJudgeAvailable()) {
      console.log(SKIP_MSG);
      this.skip();
      return;
    }
    const ids = await getProjectIds();
    if (ids.length < 1) {
      this.skip();
      return;
    }
    const projectId = ids[0];

    // Send a prompt that requires a specific kind of response
    await sendPromptInProject(projectId, 'List the files in the current directory. Just list them, nothing else.');

    // Wait for agent to start; treat a non-starting agent as "environment not ready".
    try {
      await waitForProjectAgentStatus(projectId, 'running', 15_000);
    } catch {
      console.log('Agent did not start — Claude CLI may not be available');
      this.skip();
      return;
    }

    // Wait for completion
    await waitForProjectAgentStatus(projectId, 'idle', 120_000);

    // Get the agent's output
    const messages = await getAgentMessages(projectId);

    // Use LLM judge to evaluate the response
    const verdict = await assertWithJudge(
      'The output should contain a file listing that includes at least one filename (like README.md or hello.py). It should look like a directory listing, not an error message.',
      messages,
      { context: 'BTerminal agent was asked to list files in a test project directory containing README.md and hello.py' },
    );

    // Log the judge's reasoning BEFORE asserting: a failing expect throws,
    // so anything after it would never run.
    if (!verdict.pass) {
      console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
    }
    expect(verdict.pass).toBe(true);
  });

  it('should produce response with appropriate tool usage', async function () {
    if (!isJudgeAvailable()) {
      console.log(SKIP_MSG);
      this.skip();
      return;
    }
    const ids = await getProjectIds();
    if (ids.length < 1) {
      this.skip();
      return;
    }
    const projectId = ids[0];

    // Check that the previous response (from prior test) involved tool calls
    const messages = await getAgentMessages(projectId);
    if (!messages.trim()) {
      // The prior test was skipped or produced nothing, so there is nothing to judge.
      console.log('Skipping — no agent output available to judge');
      this.skip();
      return;
    }

    const verdict = await assertWithJudge(
      'The output should show evidence that the agent used tools (like Bash, Read, Glob, or LS commands) to list files. Tool usage typically appears as tool call names, command text, or file paths in the output.',
      messages,
      { context: 'BTerminal renders agent tool calls in collapsible sections showing the tool name and output' },
    );

    // Log first; the failing expect below would otherwise prevent the log.
    if (!verdict.pass) {
      console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
    }
    expect(verdict.pass).toBe(true);
  });
});
// ─── Scenario B5: LLM-judged code generation quality ─────────────────
describe('Scenario B5 — LLM-Judged Code Generation', () => {
  const SKIP_MSG = 'Skipping — ANTHROPIC_API_KEY not set';

  it('should generate valid code when asked', async function () {
    if (!isJudgeAvailable()) {
      console.log(SKIP_MSG);
      this.skip();
      return;
    }
    const ids = await getProjectIds();
    if (ids.length < 1) {
      this.skip();
      return;
    }
    const projectId = ids[0];

    // Ask agent to read and explain existing code
    await sendPromptInProject(
      projectId,
      'Read hello.py and tell me what the greet function does. One sentence answer.',
    );

    // Treat a non-starting agent as "environment not ready" rather than a failure.
    try {
      await waitForProjectAgentStatus(projectId, 'running', 15_000);
    } catch {
      console.log('Agent did not start — Claude CLI may not be available');
      this.skip();
      return;
    }
    await waitForProjectAgentStatus(projectId, 'idle', 120_000);

    const messages = await getAgentMessages(projectId);
    const verdict = await assertWithJudge(
      'The response should correctly describe that the greet function takes a name parameter and returns a greeting string like "Hello, {name}!". The explanation should be roughly one sentence as requested.',
      messages,
      { context: 'hello.py contains: def greet(name: str) -> str:\n return f"Hello, {name}!"' },
    );

    // Log the judge's reasoning BEFORE asserting: a failing expect throws,
    // so anything after it would never run.
    if (!verdict.pass) {
      console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
    }
    expect(verdict.pass).toBe(true);
  });
});
// ─── Scenario B6: Context tab reflects agent activity ────────────────
describe('Scenario B6 — Context Tab After Agent Activity', () => {
  it('should show token usage in Context tab after agent ran', async () => {
    const projectIds = await getProjectIds();
    if (projectIds.length === 0) return;
    const target = projectIds[0];

    // Context tab is index 2.
    await switchProjectTab(target, 2);

    // Read the first stats / token-meter element of the project box, if any.
    const statsText = await browser.execute((id) => {
      const statsEl = document
        .querySelector(`[data-project-id="${id}"]`)
        ?.querySelector('.context-stats, .token-meter, .stat-value');
      return statsEl?.textContent ?? '';
    }, target);

    // Empty text is tolerated: no agent may have run (earlier tests skipped).
    if (statsText !== '') {
      expect(statsText.length).toBeGreaterThan(0);
    }

    // Return to the Model tab (index 0).
    await switchProjectTab(target, 0);
  });
});

View file

@ -28,6 +28,7 @@ export const config = {
specs: [
resolve(__dirname, 'specs/bterminal.test.ts'),
resolve(__dirname, 'specs/agent-scenarios.test.ts'),
resolve(__dirname, 'specs/phase-b.test.ts'),
],
// ── Capabilities ──