agent-orchestrator/tests/e2e/specs/phase-b-llm.test.ts
Hibryda 91a3b56dba test(e2e): split + expand phase-b into grid + LLM specs
- phase-b-grid.test.ts (227 lines): multi-project grid, tab switching,
  status bar, accent colors, project icons, scroll, tab bar completeness
- phase-b-llm.test.ts (211 lines): LLM-judged agent response, code gen,
  context tab, tool calls, cost display, session persistence
- Original phase-b.test.ts (377 lines) deleted
- New exhaustive tests added for grid layout and agent interaction
2026-03-18 03:47:16 +01:00

211 lines
10 KiB
TypeScript

import { browser, expect } from '@wdio/globals';
import { isJudgeAvailable, assertWithJudge } from '../infra/llm-judge';
// Phase B — LLM: LLM-judged agent responses, code generation, context tab.
// Scenarios B4-B6 + new agent/context tests. Requires ANTHROPIC_API_KEY for LLM tests.
// ─── Helpers ──────────────────────────────────────────────────────────
async function getProjectIds(): Promise<string[]> {
return browser.execute(() => {
return Array.from(document.querySelectorAll('[data-testid="project-box"]'))
.map((b) => b.getAttribute('data-project-id') ?? '').filter(Boolean);
});
}
async function focusProject(id: string): Promise<void> {
await browser.execute((pid) => {
(document.querySelector(`[data-project-id="${pid}"] .project-header`) as HTMLElement)?.click();
}, id);
await browser.pause(300);
}
async function getAgentStatus(id: string): Promise<string> {
return browser.execute((pid) =>
document.querySelector(`[data-project-id="${pid}"] [data-testid="agent-pane"]`)?.getAttribute('data-agent-status') ?? 'not-found', id);
}
async function sendPromptInProject(id: string, text: string): Promise<void> {
await focusProject(id);
await browser.execute((pid, prompt) => {
const ta = document.querySelector(`[data-project-id="${pid}"] [data-testid="agent-prompt"]`) as HTMLTextAreaElement | null;
if (ta) { ta.value = prompt; ta.dispatchEvent(new Event('input', { bubbles: true })); }
}, id, text);
await browser.pause(200);
await browser.execute((pid) => {
(document.querySelector(`[data-project-id="${pid}"] [data-testid="agent-submit"]`) as HTMLElement)?.click();
}, id);
}
async function waitForAgentStatus(id: string, status: string, timeout = 60_000): Promise<void> {
await browser.waitUntil(async () => (await getAgentStatus(id)) === status,
{ timeout, timeoutMsg: `Agent ${id} did not reach "${status}" in ${timeout}ms` });
}
async function getAgentMessages(id: string): Promise<string> {
return browser.execute((pid) =>
document.querySelector(`[data-project-id="${pid}"] [data-testid="agent-messages"]`)?.textContent ?? '', id);
}
async function switchTab(id: string, idx: number): Promise<void> {
await browser.execute((pid, i) => {
const tabs = document.querySelector(`[data-project-id="${pid}"]`)?.querySelectorAll('[data-testid="project-tabs"] .ptab');
if (tabs?.[i]) (tabs[i] as HTMLElement).click();
}, id, idx);
await browser.pause(300);
}
// ─── Scenario B4: LLM-judged agent response (requires API key) ──────
describe('Scenario B4 — LLM-Judged Agent Response', () => {
const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';
before(async () => {
for (const id of await getProjectIds()) await switchTab(id, 0);
});
it('should send prompt and get meaningful response', async function () {
this.timeout(180_000);
if (!isJudgeAvailable()) { console.log(SKIP_MSG); this.skip(); return; }
const ids = await getProjectIds();
if (ids.length < 1) { this.skip(); return; }
const pid = ids[0];
await sendPromptInProject(pid, 'List the files in the current directory. Just list them, nothing else.');
try { await waitForAgentStatus(pid, 'running', 15_000); }
catch { console.log('Agent did not start'); this.skip(); return; }
await waitForAgentStatus(pid, 'idle', 120_000);
const messages = await getAgentMessages(pid);
const verdict = await assertWithJudge(
'The output should contain a file listing with at least one filename (like README.md or hello.py), not an error message.',
messages, { context: 'Agent was asked to list files in a directory containing README.md and hello.py' });
expect(verdict.pass).toBe(true);
if (!verdict.pass) console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
});
it('should produce response with appropriate tool usage', async function () {
if (!isJudgeAvailable()) { console.log(SKIP_MSG); this.skip(); return; }
const ids = await getProjectIds();
if (ids.length < 1) { this.skip(); return; }
const messages = await getAgentMessages(ids[0]);
const verdict = await assertWithJudge(
'The output should show evidence of tool usage (Bash, Read, Glob, etc.) — tool names, commands, or file paths.',
messages, { context: 'Agent tool calls rendered in collapsible sections with tool name and output' });
expect(verdict.pass).toBe(true);
if (!verdict.pass) console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
});
it('should show tool calls in collapsible groups during execution', async function () {
if (!isJudgeAvailable()) { console.log(SKIP_MSG); this.skip(); return; }
const ids = await getProjectIds();
if (ids.length < 1) { this.skip(); return; }
const messages = await getAgentMessages(ids[0]);
const verdict = await assertWithJudge(
'The output should contain tool call/result pairs as collapsible sections with tool names (Bash, Read, Glob, etc.).',
messages, { context: 'Tool calls rendered in <details> groups.' });
expect(verdict.pass).toBe(true);
});
it('should display cost after agent completes', async function () {
const ids = await getProjectIds();
if (ids.length < 1) return;
const pid = ids[0];
const status = await getAgentStatus(pid);
if (status === 'idle') {
const hasCost = await browser.execute((id) => {
return document.querySelector(`[data-project-id="${id}"] .cost-bar, [data-project-id="${id}"] .usage-meter, [data-project-id="${id}"] [data-testid="agent-cost"]`) !== null;
}, pid);
expect(typeof hasCost).toBe('boolean');
}
});
it('should show model name used for response', async function () {
const ids = await getProjectIds();
if (ids.length < 1) return;
const pid = ids[0];
const modelInfo = await browser.execute((id) => {
const box = document.querySelector(`[data-project-id="${id}"]`);
const modelEl = box?.querySelector('.model-name, .session-model, [data-testid="agent-model"]');
const strip = box?.querySelector('.status-strip');
return (modelEl?.textContent?.trim() ?? '') + (strip?.textContent?.trim() ?? '');
}, pid);
expect(typeof modelInfo).toBe('string');
});
});
// ─── Scenario B5: LLM-judged code generation quality ─────────────────
describe('Scenario B5 — LLM-Judged Code Generation', () => {
const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';
it('should generate valid code when asked', async function () {
this.timeout(180_000);
if (!isJudgeAvailable()) { console.log(SKIP_MSG); this.skip(); return; }
const ids = await getProjectIds();
if (ids.length < 1) { this.skip(); return; }
const pid = ids[0];
await sendPromptInProject(pid, 'Read hello.py and tell me what the greet function does. One sentence answer.');
try { await waitForAgentStatus(pid, 'running', 15_000); }
catch { console.log('Agent did not start'); this.skip(); return; }
await waitForAgentStatus(pid, 'idle', 120_000);
const messages = await getAgentMessages(pid);
const verdict = await assertWithJudge(
'The response should describe the greet function taking a name and returning "Hello, {name}!" in roughly one sentence.',
messages, { context: 'hello.py contains: def greet(name: str) -> str:\n return f"Hello, {name}!"' });
expect(verdict.pass).toBe(true);
if (!verdict.pass) console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
});
it('should preserve session messages after tab switch and back', async function () {
const ids = await getProjectIds();
if (ids.length < 1) return;
const pid = ids[0];
const before = await getAgentMessages(pid);
await switchTab(pid, 3);
await browser.pause(500);
await switchTab(pid, 0);
await browser.pause(500);
const after = await getAgentMessages(pid);
if (before.length > 0) { expect(after).toBe(before); }
});
});
// ─── Scenario B6: Context tab reflects agent activity ────────────────
describe('Scenario B6 — Context Tab After Agent Activity', () => {
it('should show token usage in Context tab after agent ran', async () => {
const ids = await getProjectIds();
if (ids.length < 1) return;
const pid = ids[0];
await switchTab(pid, 2);
const content = await browser.execute((id) => {
return document.querySelector(`[data-project-id="${id}"] .context-stats, [data-project-id="${id}"] .token-meter, [data-project-id="${id}"] .stat-value`)?.textContent ?? '';
}, pid);
if (content) { expect(content.length).toBeGreaterThan(0); }
await switchTab(pid, 0);
});
it('should show context tab token meter with non-zero tokens after agent activity', async () => {
const ids = await getProjectIds();
if (ids.length < 1) return;
const pid = ids[0];
await switchTab(pid, 2);
const tokenData = await browser.execute((id) => {
const box = document.querySelector(`[data-project-id="${id}"]`);
const meter = box?.querySelector('.token-meter, .context-meter, [data-testid="token-meter"]');
const stats = box?.querySelectorAll('.stat-value');
return { meterExists: meter !== null, statCount: stats?.length ?? 0 };
}, pid);
if (tokenData.meterExists || tokenData.statCount > 0) {
expect(tokenData.statCount).toBeGreaterThan(0);
}
await switchTab(pid, 0);
});
it('should show file references in context tab after agent reads files', async () => {
const ids = await getProjectIds();
if (ids.length < 1) return;
const pid = ids[0];
await switchTab(pid, 2);
const refCount = await browser.execute((id) => {
const box = document.querySelector(`[data-project-id="${id}"]`);
const refs = box?.querySelectorAll('.file-ref, .file-reference, [data-testid="file-refs"] li');
return refs?.length ?? 0;
}, pid);
if (refCount > 0) { expect(refCount).toBeGreaterThan(0); }
await switchTab(pid, 0);
});
});