Root cause: WebDriverIO devtools protocol wraps functions in a polyfill that puts `return` inside eval() (not a function body) → "Illegal return". Fix: exec() wrapper in helpers/execute.ts converts function args to IIFE strings before passing to browser.execute(). Works identically on both WebDriver (Tauri) and CDP/devtools (Electrobun CEF). - 35 spec files updated (browser.execute → exec) - 4 config files updated (string-form expressions) - helpers/actions.ts + assertions.ts updated - 560 vitest + 116 cargo passing
212 lines
10 KiB
TypeScript
212 lines
10 KiB
TypeScript
import { browser, expect } from '@wdio/globals';
|
|
import { isJudgeAvailable, assertWithJudge } from '../infra/llm-judge';
|
|
import { exec } from '../helpers/execute.ts';
|
|
|
|
// Phase B — LLM: LLM-judged agent responses, code generation, context tab.
|
|
// Scenarios B4-B6 + new agent/context tests. LLM-judged tests need an available judge (CLI or ANTHROPIC_API_KEY) and are skipped otherwise.
|
|
|
|
// ─── Helpers ──────────────────────────────────────────────────────────
|
|
|
|
/**
 * Reads the `data-project-id` attribute of every `[data-testid="project-box"]`
 * element in the page.
 *
 * The callback is handed to `exec`, so it must stay self-contained (no
 * references to variables outside its own body).
 *
 * @returns The project ids; boxes with a missing or empty id are dropped.
 */
async function getProjectIds(): Promise<string[]> {
  return exec(() => {
    const boxes = document.querySelectorAll('[data-testid="project-box"]');
    const ids = Array.from(boxes).map((box) => box.getAttribute('data-project-id') ?? '');
    // Empty strings stand in for missing attributes and are filtered out here.
    return ids.filter(Boolean);
  });
}
|
|
/**
 * Clicks the `.project-header` inside the project box with the given id and
 * then pauses for 300 ms.
 *
 * If no matching header exists the click is silently skipped.
 *
 * @param id Value of the box's `data-project-id` attribute.
 */
async function focusProject(id: string): Promise<void> {
  await exec((pid) => {
    const header = document.querySelector(`[data-project-id="${pid}"] .project-header`) as HTMLElement | null;
    header?.click();
  }, id);
  await browser.pause(300);
}
|
|
/**
 * Reads the `data-agent-status` attribute of the project's agent pane.
 *
 * @param id Value of the box's `data-project-id` attribute.
 * @returns The attribute value, or `'not-found'` when the pane or the
 *   attribute is missing.
 */
async function getAgentStatus(id: string): Promise<string> {
  return exec((pid) => {
    const pane = document.querySelector(`[data-project-id="${pid}"] [data-testid="agent-pane"]`);
    return pane?.getAttribute('data-agent-status') ?? 'not-found';
  }, id);
}
|
|
/**
 * Focuses a project, writes `text` into its agent prompt textarea, and clicks
 * the submit control.
 *
 * The textarea value is set directly and followed by a bubbling `input` event;
 * a 200 ms pause separates filling the prompt from clicking submit. Missing
 * elements are skipped without error.
 *
 * @param id Value of the box's `data-project-id` attribute.
 * @param text Prompt text to place in the textarea.
 */
async function sendPromptInProject(id: string, text: string): Promise<void> {
  await focusProject(id);

  await exec((pid, prompt) => {
    const textarea = document.querySelector(
      `[data-project-id="${pid}"] [data-testid="agent-prompt"]`,
    ) as HTMLTextAreaElement | null;
    if (!textarea) return;
    textarea.value = prompt;
    textarea.dispatchEvent(new Event('input', { bubbles: true }));
  }, id, text);
  await browser.pause(200);

  await exec((pid) => {
    const submit = document.querySelector(
      `[data-project-id="${pid}"] [data-testid="agent-submit"]`,
    ) as HTMLElement | null;
    submit?.click();
  }, id);
}
|
|
/**
 * Polls `getAgentStatus` until it equals `status`.
 *
 * @param id Value of the box's `data-project-id` attribute.
 * @param status Status string to wait for.
 * @param timeout Upper bound passed to `browser.waitUntil` (default 60 000).
 */
async function waitForAgentStatus(id: string, status: string, timeout = 60_000): Promise<void> {
  const hasReachedStatus = async (): Promise<boolean> => {
    const current = await getAgentStatus(id);
    return current === status;
  };
  const timeoutMsg = `Agent ${id} did not reach "${status}" in ${timeout}ms`;
  await browser.waitUntil(hasReachedStatus, { timeout, timeoutMsg });
}
|
|
/**
 * Returns the text content of the project's `[data-testid="agent-messages"]`
 * element.
 *
 * @param id Value of the box's `data-project-id` attribute.
 * @returns The element's `textContent`, or `''` when the element is missing.
 */
async function getAgentMessages(id: string): Promise<string> {
  return exec((pid) => {
    const log = document.querySelector(`[data-project-id="${pid}"] [data-testid="agent-messages"]`);
    return log?.textContent ?? '';
  }, id);
}
|
|
/**
 * Clicks the `idx`-th `.ptab` inside the project's tab bar, then pauses 300 ms.
 *
 * Nothing is clicked when the project box or the tab at that index is missing.
 *
 * @param id Value of the box's `data-project-id` attribute.
 * @param idx Zero-based position of the tab within the matched `.ptab` list.
 */
async function switchTab(id: string, idx: number): Promise<void> {
  await exec((pid, i) => {
    const box = document.querySelector(`[data-project-id="${pid}"]`);
    const tabs = box?.querySelectorAll('[data-testid="project-tabs"] .ptab');
    const target = tabs?.[i];
    if (target) (target as HTMLElement).click();
  }, id, idx);
  await browser.pause(300);
}
|
|
|
|
// ─── Scenario B4: LLM-judged agent response (requires API key) ──────
|
|
|
|
// Scenario B4: sends a prompt to the first project and lets the LLM judge
// evaluate the rendered agent output. The judge-backed tests skip themselves
// when `isJudgeAvailable()` is false. The second and third tests re-read the
// messages already on screen, so they rely on the first test having run.
describe('Scenario B4 — LLM-Judged Agent Response', () => {
  const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';

  // Put every project on tab index 0 before the scenario starts
  // (presumably the agent tab — confirm against the tab order in the UI).
  before(async () => {
    for (const id of await getProjectIds()) await switchTab(id, 0);
  });

  it('should send prompt and get meaningful response', async function () {
    this.timeout(180_000);
    if (!isJudgeAvailable()) { console.log(SKIP_MSG); this.skip(); return; }
    const ids = await getProjectIds();
    if (ids.length < 1) { this.skip(); return; }
    const pid = ids[0];

    await sendPromptInProject(pid, 'List the files in the current directory. Just list them, nothing else.');
    // An agent that never reaches "running" is treated as an environment
    // problem and skips the test instead of failing it.
    try { await waitForAgentStatus(pid, 'running', 15_000); }
    catch { console.log('Agent did not start'); this.skip(); return; }
    await waitForAgentStatus(pid, 'idle', 120_000);

    const messages = await getAgentMessages(pid);
    const verdict = await assertWithJudge(
      'The output should contain a file listing with at least one filename (like README.md or hello.py), not an error message.',
      messages, { context: 'Agent was asked to list files in a directory containing README.md and hello.py' });
    // Log before asserting: a failing expect throws, so a log placed after it
    // would never run.
    if (!verdict.pass) console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
    expect(verdict.pass).toBe(true);
  });

  it('should produce response with appropriate tool usage', async function () {
    if (!isJudgeAvailable()) { console.log(SKIP_MSG); this.skip(); return; }
    const ids = await getProjectIds();
    if (ids.length < 1) { this.skip(); return; }

    const messages = await getAgentMessages(ids[0]);
    const verdict = await assertWithJudge(
      'The output should show evidence of tool usage (Bash, Read, Glob, etc.) — tool names, commands, or file paths.',
      messages, { context: 'Agent tool calls rendered in collapsible sections with tool name and output' });
    if (!verdict.pass) console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
    expect(verdict.pass).toBe(true);
  });

  it('should show tool calls in collapsible groups during execution', async function () {
    if (!isJudgeAvailable()) { console.log(SKIP_MSG); this.skip(); return; }
    const ids = await getProjectIds();
    if (ids.length < 1) { this.skip(); return; }

    const messages = await getAgentMessages(ids[0]);
    const verdict = await assertWithJudge(
      'The output should contain tool call/result pairs as collapsible sections with tool names (Bash, Read, Glob, etc.).',
      messages, { context: 'Tool calls rendered in <details> groups.' });
    if (!verdict.pass) console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
    expect(verdict.pass).toBe(true);
  });

  it('should display cost after agent completes', async function () {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];

    const status = await getAgentStatus(pid);
    if (status === 'idle') {
      const hasCost = await exec((id) => {
        return document.querySelector(`[data-project-id="${id}"] .cost-bar, [data-project-id="${id}"] .usage-meter, [data-project-id="${id}"] [data-testid="agent-cost"]`) !== null;
      }, pid);
      // Only checks that the probe ran and returned a boolean; it does not
      // require a cost element to be present.
      expect(typeof hasCost).toBe('boolean');
    }
  });

  it('should show model name used for response', async function () {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];

    const modelInfo = await exec((id) => {
      const box = document.querySelector(`[data-project-id="${id}"]`);
      const modelEl = box?.querySelector('.model-name, .session-model, [data-testid="agent-model"]');
      const strip = box?.querySelector('.status-strip');
      return (modelEl?.textContent?.trim() ?? '') + (strip?.textContent?.trim() ?? '');
    }, pid);
    // Only checks that the probe returned a string; an empty string passes.
    expect(typeof modelInfo).toBe('string');
  });
});
|
|
|
|
// ─── Scenario B5: LLM-judged code generation quality ─────────────────
|
|
|
|
// Scenario B5: asks the agent about hello.py and has the LLM judge grade the
// answer, then checks that the message log is unchanged by a tab round-trip.
describe('Scenario B5 — LLM-Judged Code Generation', () => {
  const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';

  it('should generate valid code when asked', async function () {
    this.timeout(180_000);
    if (!isJudgeAvailable()) { console.log(SKIP_MSG); this.skip(); return; }
    const ids = await getProjectIds();
    if (ids.length < 1) { this.skip(); return; }
    const pid = ids[0];

    await sendPromptInProject(pid, 'Read hello.py and tell me what the greet function does. One sentence answer.');
    // An agent that never reaches "running" skips the test instead of failing it.
    try { await waitForAgentStatus(pid, 'running', 15_000); }
    catch { console.log('Agent did not start'); this.skip(); return; }
    await waitForAgentStatus(pid, 'idle', 120_000);

    const messages = await getAgentMessages(pid);
    const verdict = await assertWithJudge(
      'The response should describe the greet function taking a name and returning "Hello, {name}!" in roughly one sentence.',
      messages, { context: 'hello.py contains: def greet(name: str) -> str:\n return f"Hello, {name}!"' });
    // Log before asserting: a failing expect throws, so a log placed after it
    // would never run.
    if (!verdict.pass) console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
    expect(verdict.pass).toBe(true);
  });

  it('should preserve session messages after tab switch and back', async function () {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];

    // Named so it does not shadow Mocha's `before` hook.
    const messagesBefore = await getAgentMessages(pid);
    await switchTab(pid, 3);
    await browser.pause(500);
    await switchTab(pid, 0);
    await browser.pause(500);
    const messagesAfter = await getAgentMessages(pid);

    // With no messages beforehand there is nothing to compare.
    if (messagesBefore.length > 0) { expect(messagesAfter).toBe(messagesBefore); }
  });
});
|
|
|
|
// ─── Scenario B6: Context tab reflects agent activity ────────────────
|
|
|
|
// Scenario B6: opens tab index 2 of the first project (the Context tab,
// going by the test titles) and probes it for token stats and file
// references. Every test switches back to tab 0 in a `finally` block so a
// failure cannot leave the UI on the wrong tab for later tests.
describe('Scenario B6 — Context Tab After Agent Activity', () => {
  it('should show token usage in Context tab after agent ran', async () => {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];

    await switchTab(pid, 2);
    try {
      const content = await exec((id) => {
        return document.querySelector(`[data-project-id="${id}"] .context-stats, [data-project-id="${id}"] .token-meter, [data-project-id="${id}"] .stat-value`)?.textContent ?? '';
      }, pid);
      // Soft check: only asserts when some stats text was found.
      if (content) { expect(content.length).toBeGreaterThan(0); }
    } finally {
      await switchTab(pid, 0);
    }
  });

  it('should show context tab token meter with non-zero tokens after agent activity', async () => {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];

    await switchTab(pid, 2);
    try {
      const tokenData = await exec((id) => {
        const box = document.querySelector(`[data-project-id="${id}"]`);
        const meter = box?.querySelector('.token-meter, .context-meter, [data-testid="token-meter"]');
        const stats = box?.querySelectorAll('.stat-value');
        // `!= null` rather than `!== null`: when `box` is missing the optional
        // chain yields `undefined`, which must not count as an existing meter.
        return { meterExists: meter != null, statCount: stats?.length ?? 0 };
      }, pid);
      // When a meter is rendered, at least one stat value must accompany it.
      if (tokenData.meterExists || tokenData.statCount > 0) {
        expect(tokenData.statCount).toBeGreaterThan(0);
      }
    } finally {
      await switchTab(pid, 0);
    }
  });

  it('should show file references in context tab after agent reads files', async () => {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];

    await switchTab(pid, 2);
    try {
      const refCount = await exec((id) => {
        const box = document.querySelector(`[data-project-id="${id}"]`);
        const refs = box?.querySelectorAll('.file-ref, .file-reference, [data-testid="file-refs"] li');
        return refs?.length ?? 0;
      }, pid);
      // Soft check: only asserts when at least one reference was found.
      if (refCount > 0) { expect(refCount).toBeGreaterThan(0); }
    } finally {
      await switchTab(pid, 0);
    }
  });
});
|