import { browser, expect } from '@wdio/globals';
import { isJudgeAvailable, assertWithJudge } from '../infra/llm-judge';

// Phase B — LLM: LLM-judged agent responses, code generation, context tab.
// Scenarios B4-B6 + new agent/context tests. Requires ANTHROPIC_API_KEY for LLM tests.

// ─── Helpers ──────────────────────────────────────────────────────────

/** Verdict value resolved by `assertWithJudge`. */
type JudgeVerdict = Awaited<ReturnType<typeof assertWithJudge>>;

/** Logged when a judge-backed test is skipped because no judge is available. */
const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';

/**
 * Asserts that the judge passed.
 *
 * The reasoning is logged BEFORE the assertion: a failing `expect` throws, so
 * logging afterwards would never run on the only path where it is useful.
 */
function expectJudgePass(verdict: JudgeVerdict): void {
  if (!verdict.pass) {
    console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
  }
  expect(verdict.pass).toBe(true);
}

/**
 * Collects the `data-project-id` of every `[data-testid="project-box"]`
 * element in the page, dropping boxes whose id is missing or empty.
 */
async function getProjectIds(): Promise<string[]> {
  return browser.execute(() =>
    Array.from(document.querySelectorAll('[data-testid="project-box"]'))
      .map((b) => b.getAttribute('data-project-id') ?? '')
      .filter(Boolean),
  );
}

/**
 * Clicks the `.project-header` of the given project box, then pauses 300 ms.
 * Does nothing (apart from the pause) when the header is not found.
 */
async function focusProject(id: string): Promise<void> {
  await browser.execute((pid) => {
    document
      .querySelector<HTMLElement>(`[data-project-id="${pid}"] .project-header`)
      ?.click();
  }, id);
  await browser.pause(300);
}

/**
 * Reads `data-agent-status` from the project's agent pane.
 *
 * @returns the attribute value, or `'not-found'` when the pane or the
 *          attribute is absent.
 */
async function getAgentStatus(id: string): Promise<string> {
  return browser.execute(
    (pid) =>
      document
        .querySelector(`[data-project-id="${pid}"] [data-testid="agent-pane"]`)
        ?.getAttribute('data-agent-status') ?? 'not-found',
    id,
  );
}

/**
 * Focuses the project, writes `text` into its prompt textarea and clicks the
 * submit control. Missing elements are skipped silently; callers detect a
 * failed submission through the agent status never changing.
 */
async function sendPromptInProject(id: string, text: string): Promise<void> {
  await focusProject(id);
  await browser.execute(
    (pid, prompt) => {
      const ta = document.querySelector<HTMLTextAreaElement>(
        `[data-project-id="${pid}"] [data-testid="agent-prompt"]`,
      );
      if (ta) {
        ta.value = prompt;
        // Setting .value does not fire events; dispatch 'input' explicitly.
        ta.dispatchEvent(new Event('input', { bubbles: true }));
      }
    },
    id,
    text,
  );
  await browser.pause(200);
  await browser.execute((pid) => {
    document
      .querySelector<HTMLElement>(`[data-project-id="${pid}"] [data-testid="agent-submit"]`)
      ?.click();
  }, id);
}

/**
 * Polls until the project's agent status equals `status`.
 *
 * @param timeout maximum wait in milliseconds (default 60 000).
 * @throws when the status is not reached within `timeout` (via `waitUntil`).
 */
async function waitForAgentStatus(id: string, status: string, timeout = 60_000): Promise<void> {
  await browser.waitUntil(async () => (await getAgentStatus(id)) === status, {
    timeout,
    timeoutMsg: `Agent ${id} did not reach "${status}" in ${timeout}ms`,
  });
}

/**
 * Returns the text content of the project's `agent-messages` container, or
 * `''` when the container is not present.
 */
async function getAgentMessages(id: string): Promise<string> {
  return browser.execute(
    (pid) =>
      document
        .querySelector(`[data-project-id="${pid}"] [data-testid="agent-messages"]`)
        ?.textContent ?? '',
    id,
  );
}

/**
 * Clicks the `idx`-th `.ptab` inside the project's tab strip, then pauses
 * 300 ms. An out-of-range index or missing project is a no-op.
 */
async function switchTab(id: string, idx: number): Promise<void> {
  await browser.execute(
    (pid, i) => {
      const tabs = document
        .querySelector(`[data-project-id="${pid}"]`)
        ?.querySelectorAll<HTMLElement>('[data-testid="project-tabs"] .ptab');
      tabs?.[i]?.click();
    },
    id,
    idx,
  );
  await browser.pause(300);
}

// ─── Scenario B4: LLM-judged agent response (requires API key) ──────

describe('Scenario B4 — LLM-Judged Agent Response', () => {
  before(async () => {
    // Put every project on its first tab before the scenario starts.
    for (const id of await getProjectIds()) await switchTab(id, 0);
  });

  it('should send prompt and get meaningful response', async function () {
    this.timeout(180_000);
    if (!isJudgeAvailable()) {
      console.log(SKIP_MSG);
      this.skip();
      return;
    }
    const ids = await getProjectIds();
    if (ids.length < 1) {
      this.skip();
      return;
    }
    const pid = ids[0];

    await sendPromptInProject(
      pid,
      'List the files in the current directory. Just list them, nothing else.',
    );
    try {
      await waitForAgentStatus(pid, 'running', 15_000);
    } catch {
      // Agent never started: treated as an environment problem, not a failure.
      console.log('Agent did not start');
      this.skip();
      return;
    }
    await waitForAgentStatus(pid, 'idle', 120_000);

    const messages = await getAgentMessages(pid);
    const verdict = await assertWithJudge(
      'The output should contain a file listing with at least one filename (like README.md or hello.py), not an error message.',
      messages,
      { context: 'Agent was asked to list files in a directory containing README.md and hello.py' },
    );
    expectJudgePass(verdict);
  });

  it('should produce response with appropriate tool usage', async function () {
    if (!isJudgeAvailable()) {
      console.log(SKIP_MSG);
      this.skip();
      return;
    }
    const ids = await getProjectIds();
    if (ids.length < 1) {
      this.skip();
      return;
    }

    const messages = await getAgentMessages(ids[0]);
    const verdict = await assertWithJudge(
      'The output should show evidence of tool usage (Bash, Read, Glob, etc.) — tool names, commands, or file paths.',
      messages,
      { context: 'Agent tool calls rendered in collapsible sections with tool name and output' },
    );
    expectJudgePass(verdict);
  });

  it('should show tool calls in collapsible groups during execution', async function () {
    if (!isJudgeAvailable()) {
      console.log(SKIP_MSG);
      this.skip();
      return;
    }
    const ids = await getProjectIds();
    if (ids.length < 1) {
      this.skip();
      return;
    }

    const messages = await getAgentMessages(ids[0]);
    const verdict = await assertWithJudge(
      'The output should contain tool call/result pairs as collapsible sections with tool names (Bash, Read, Glob, etc.).',
      messages,
      // NOTE(review): this literal was broken across lines with a token missing
      // between "in" and "groups."; "<details>" is the presumed original — confirm.
      { context: 'Tool calls rendered in <details> groups.' },
    );
    expectJudgePass(verdict);
  });

  it('should display cost after agent completes', async function () {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];

    const status = await getAgentStatus(pid);
    if (status === 'idle') {
      const hasCost = await browser.execute(
        (id) =>
          document.querySelector(
            `[data-project-id="${id}"] .cost-bar, [data-project-id="${id}"] .usage-meter, [data-project-id="${id}"] [data-testid="agent-cost"]`,
          ) !== null,
        pid,
      );
      // NOTE(review): soft check — only proves the query ran, not that a cost
      // element exists. Tighten to `expect(hasCost).toBe(true)` if cost is mandatory.
      expect(typeof hasCost).toBe('boolean');
    }
  });

  it('should show model name used for response', async function () {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];

    const modelInfo = await browser.execute((id) => {
      const box = document.querySelector(`[data-project-id="${id}"]`);
      const modelEl = box?.querySelector('.model-name, .session-model, [data-testid="agent-model"]');
      const strip = box?.querySelector('.status-strip');
      return (modelEl?.textContent?.trim() ?? '') + (strip?.textContent?.trim() ?? '');
    }, pid);
    // NOTE(review): soft check — always true for a string result.
    expect(typeof modelInfo).toBe('string');
  });
});

// ─── Scenario B5: LLM-judged code generation quality ─────────────────

describe('Scenario B5 — LLM-Judged Code Generation', () => {
  it('should generate valid code when asked', async function () {
    this.timeout(180_000);
    if (!isJudgeAvailable()) {
      console.log(SKIP_MSG);
      this.skip();
      return;
    }
    const ids = await getProjectIds();
    if (ids.length < 1) {
      this.skip();
      return;
    }
    const pid = ids[0];

    await sendPromptInProject(
      pid,
      'Read hello.py and tell me what the greet function does. One sentence answer.',
    );
    try {
      await waitForAgentStatus(pid, 'running', 15_000);
    } catch {
      // Agent never started: treated as an environment problem, not a failure.
      console.log('Agent did not start');
      this.skip();
      return;
    }
    await waitForAgentStatus(pid, 'idle', 120_000);

    const messages = await getAgentMessages(pid);
    const verdict = await assertWithJudge(
      'The response should describe the greet function taking a name and returning "Hello, {name}!" in roughly one sentence.',
      messages,
      { context: 'hello.py contains: def greet(name: str) -> str:\n return f"Hello, {name}!"' },
    );
    expectJudgePass(verdict);
  });

  it('should preserve session messages after tab switch and back', async function () {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];

    const messagesBefore = await getAgentMessages(pid);
    await switchTab(pid, 3);
    await browser.pause(500);
    await switchTab(pid, 0);
    await browser.pause(500);
    const messagesAfter = await getAgentMessages(pid);

    // Only meaningful when there was a session to preserve.
    if (messagesBefore.length > 0) {
      expect(messagesAfter).toBe(messagesBefore);
    }
  });
});

// ─── Scenario B6: Context tab reflects agent activity ────────────────

describe('Scenario B6 — Context Tab After Agent Activity', () => {
  it('should show token usage in Context tab after agent ran', async () => {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];

    await switchTab(pid, 2);
    try {
      const content = await browser.execute(
        (id) =>
          document.querySelector(
            `[data-project-id="${id}"] .context-stats, [data-project-id="${id}"] .token-meter, [data-project-id="${id}"] .stat-value`,
          )?.textContent ?? '',
        pid,
      );
      // NOTE(review): tautological when it runs; passes silently when empty.
      if (content) {
        expect(content.length).toBeGreaterThan(0);
      }
    } finally {
      // Always return to the first tab so a failure cannot leak UI state.
      await switchTab(pid, 0);
    }
  });

  it('should show context tab token meter with non-zero tokens after agent activity', async () => {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];

    await switchTab(pid, 2);
    try {
      const tokenData = await browser.execute((id) => {
        const box = document.querySelector(`[data-project-id="${id}"]`);
        const meter = box?.querySelector('.token-meter, .context-meter, [data-testid="token-meter"]');
        const stats = box?.querySelectorAll('.stat-value');
        return { meterExists: meter !== null, statCount: stats?.length ?? 0 };
      }, pid);
      if (tokenData.meterExists || tokenData.statCount > 0) {
        expect(tokenData.statCount).toBeGreaterThan(0);
      }
    } finally {
      // Always return to the first tab so a failure cannot leak UI state.
      await switchTab(pid, 0);
    }
  });

  it('should show file references in context tab after agent reads files', async () => {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];

    await switchTab(pid, 2);
    try {
      const refCount = await browser.execute((id) => {
        const box = document.querySelector(`[data-project-id="${id}"]`);
        const refs = box?.querySelectorAll('.file-ref, .file-reference, [data-testid="file-refs"] li');
        return refs?.length ?? 0;
      }, pid);
      // NOTE(review): tautological when it runs; passes silently when zero.
      if (refCount > 0) {
        expect(refCount).toBeGreaterThan(0);
      }
    } finally {
      // Always return to the first tab so a failure cannot leak UI state.
      await switchTab(pid, 0);
    }
  });
});