agent-orchestrator/tests/e2e/specs/llm-judged.test.ts
Hibryda 77b9ce9f62 feat: unified E2E testing engine — 205 tests, dual-stack support
Infrastructure:
- adapters/: base, tauri (port 9750), electrobun (port 9761 + PTY daemon)
- helpers/: 120+ centralized selectors, reusable actions, custom assertions
- wdio.shared.conf.js + stack-specific configs

18 unified specs (205 tests):
splash(6) smoke(15) settings(19) terminal(14) agent(15) search(12)
files(15) comms(10) tasks(10) theme(12) groups(12) keyboard(8)
notifications(10) diagnostics(8) status-bar(12) context(9)
worktree(8) llm-judged(10)

Daemon: --stack tauri|electrobun|both flag
Scripts: test:e2e:tauri, test:e2e:electrobun, test:e2e:both
2026-03-22 05:27:36 +01:00

202 lines
7.4 KiB
TypeScript

/**
* LLM-judged tests — uses Claude Haiku to evaluate UI quality.
*
* These tests are SKIPPED when ANTHROPIC_API_KEY is not set.
* They capture DOM snapshots and ask the LLM to judge correctness.
*/
import { browser, expect } from '@wdio/globals';
/** Anthropic API key read from the environment; `undefined` when the variable is not set. */
const { ANTHROPIC_API_KEY: API_KEY } = process.env;

/** True when no (non-empty) API key is available, in which case every test in this file is skipped. */
const SKIP = !API_KEY;
/** Shape of a judge decision as returned by {@link askJudge}. */
type JudgeResult = { verdict: 'pass' | 'fail'; reasoning: string };

/**
 * Pull the first text block out of a Messages API response body.
 * Returns an empty string when the body has no usable text content.
 */
function extractReplyText(data: unknown): string {
  if (typeof data !== 'object' || data === null) return '';
  const { content } = data as { content?: unknown };
  if (!Array.isArray(content)) return '';
  for (const block of content) {
    if (typeof block === 'object' && block !== null) {
      const { text } = block as { text?: unknown };
      if (typeof text === 'string') return text;
    }
  }
  return '';
}

/**
 * Turn the judge's raw reply into a verdict.
 *
 * Only an explicit "pass" yields `pass`; a missing, unknown or ambiguous
 * verdict is treated as `fail` so a malformed reply can never make a test
 * pass by accident.
 */
function parseJudgeReply(text: string): JudgeResult {
  // The reply may wrap the JSON in a ```json fence or add prose around it,
  // so isolate the outermost {...} span before parsing.
  const start = text.indexOf('{');
  const end = text.lastIndexOf('}');
  if (start !== -1 && end > start) {
    try {
      const parsed: unknown = JSON.parse(text.slice(start, end + 1));
      if (typeof parsed === 'object' && parsed !== null) {
        const { verdict, reasoning } = parsed as { verdict?: unknown; reasoning?: unknown };
        if (typeof verdict === 'string') {
          const normalized = verdict.trim().toLowerCase();
          if (normalized === 'pass' || normalized === 'fail') {
            return {
              verdict: normalized,
              reasoning: typeof reasoning === 'string' ? reasoning : text,
            };
          }
        }
      }
    } catch {
      // Not valid JSON — fall through to the free-text heuristic below.
    }
  }

  // Free-text fallback: require a "pass" word and no "fail" word. A plain
  // substring test would misread "does not pass" / "fail ... pass" replies.
  const lower = text.toLowerCase();
  const saysPass = /\bpass(?:es|ed|ing)?\b/.test(lower);
  const saysFail = /\bfail(?:s|ed|ing|ure)?\b/.test(lower);
  return { verdict: saysPass && !saysFail ? 'pass' : 'fail', reasoning: text };
}

/**
 * Ask the LLM judge to evaluate a prompt and return its verdict.
 *
 * The prompt is sent as a single user message to the Anthropic Messages API.
 * The reply is expected to contain JSON of the form
 * `{"verdict":"pass"|"fail","reasoning":"..."}`.
 *
 * @param prompt - Full instruction text, including any captured UI data.
 * @returns The verdict and the judge's reasoning. When no API key is set the
 *   result is a `pass` marked as skipped. When the API answers with a non-2xx
 *   status the result is a `fail` whose reasoning carries the HTTP status and
 *   response body, so the cause is visible instead of an empty string.
 */
async function askJudge(prompt: string): Promise<{ verdict: 'pass' | 'fail'; reasoning: string }> {
  if (!API_KEY) return { verdict: 'pass', reasoning: 'Skipped — no API key' };

  const res = await fetch('https://api.anthropic.com/v1/messages', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': API_KEY,
      'anthropic-version': '2023-06-01',
    },
    body: JSON.stringify({
      // NOTE(review): the previous ID `claude-haiku-4-5-20250315` does not
      // appear to be a published snapshot; confirm this one against the
      // Anthropic models list.
      model: 'claude-haiku-4-5-20251001',
      max_tokens: 300,
      messages: [{ role: 'user', content: prompt }],
    }),
  });

  if (!res.ok) {
    let detail = '';
    try {
      detail = (await res.text()).slice(0, 300);
    } catch {
      // Body unreadable; the status line alone is reported.
    }
    return {
      verdict: 'fail',
      reasoning: `Judge request failed: HTTP ${res.status} ${res.statusText} ${detail}`.trim(),
    };
  }

  const data: unknown = await res.json();
  return parseJudgeReply(extractReplyText(data));
}
/**
 * UI quality checks. Most tests capture a small snapshot of the running app
 * (HTML fragment, computed styles, element counts) inside the browser and hand
 * it to `askJudge`; two tests ('render without JS errors', 'responsive grid
 * layout') assert on the DOM directly. Every test is skipped when `SKIP` is
 * set, and tests whose target element is absent are reported as skipped
 * rather than passed.
 */
describe('LLM-judged UI quality', () => {
  it('should have complete settings panel', async function () {
    if (SKIP) return this.skip();
    // Cap the snapshot at 2000 characters to keep the prompt small.
    const html = await browser.execute(() => {
      const panel = document.querySelector('.settings-drawer')
        ?? document.querySelector('.sidebar-panel');
      return panel?.innerHTML?.slice(0, 2000) ?? '';
    });
    const result = await askJudge(
      `You are a UI testing judge. Given this settings panel HTML, does it contain reasonable settings categories (theme, font, projects, etc.)? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}\n\nHTML:\n${html}`
    );
    expect(result.verdict).toBe('pass');
  });

  it('should have visually consistent theme', async function () {
    if (SKIP) return this.skip();
    // Read a handful of theme custom properties from the root element.
    const vars = await browser.execute(() => {
      const s = getComputedStyle(document.documentElement);
      return {
        base: s.getPropertyValue('--ctp-base').trim(),
        text: s.getPropertyValue('--ctp-text').trim(),
        surface0: s.getPropertyValue('--ctp-surface0').trim(),
        blue: s.getPropertyValue('--ctp-blue').trim(),
      };
    });
    const result = await askJudge(
      `You are a UI theme judge. Given these CSS custom property values from a dark-theme app, do they form a visually consistent palette? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}\n\nVariables: ${JSON.stringify(vars)}`
    );
    expect(result.verdict).toBe('pass');
  });

  it('should have proper error handling in UI', async function () {
    if (SKIP) return this.skip();
    const toasts = await browser.execute(() => {
      return document.querySelectorAll('.toast-error, .load-error').length;
    });
    const result = await askJudge(
      `A UI app shows ${toasts} error toasts after loading. For a freshly launched test instance, is 0-1 errors acceptable? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}`
    );
    expect(result.verdict).toBe('pass');
  });

  it('should have readable text contrast', async function () {
    if (SKIP) return this.skip();
    const colors = await browser.execute(() => {
      const body = getComputedStyle(document.body);
      return {
        bg: body.backgroundColor,
        text: body.color,
        font: body.fontFamily,
        size: body.fontSize,
      };
    });
    const result = await askJudge(
      `You are an accessibility judge. Given body background="${colors.bg}", text color="${colors.text}", font="${colors.font}", size="${colors.size}" — does this have adequate contrast for readability? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}`
    );
    expect(result.verdict).toBe('pass');
  });

  it('should have well-structured project cards', async function () {
    if (SKIP) return this.skip();
    const html = await browser.execute(() => {
      const card = document.querySelector('.project-card');
      return card?.innerHTML?.slice(0, 1500) ?? '';
    });
    // No card rendered: nothing to judge, so report the test as skipped
    // instead of letting it count as a pass.
    if (!html) return this.skip();
    const result = await askJudge(
      `You are a UI judge. Does this project card HTML contain expected sections (header, agent/terminal area, tabs)? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}\n\nHTML:\n${html}`
    );
    expect(result.verdict).toBe('pass');
  });

  it('should have consistent layout structure', async function () {
    if (SKIP) return this.skip();
    const layout = await browser.execute(() => {
      const root = document.querySelector('.app-shell') ?? document.body;
      return Array.from(root.children).map((child) => {
        const rect = child.getBoundingClientRect();
        return {
          tag: child.tagName,
          // classList is used because className is not a string on SVG
          // elements; only the first three classes are kept.
          cls: Array.from(child.classList).slice(0, 3).join(' '),
          w: rect.width,
          h: rect.height,
        };
      });
    });
    const result = await askJudge(
      `You are a layout judge. This app has these top-level children: ${JSON.stringify(layout)}. Does this look like a reasonable app layout (sidebar, main content, status bar)? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}`
    );
    expect(result.verdict).toBe('pass');
  });

  it('should have accessible interactive elements', async function () {
    if (SKIP) return this.skip();
    // A button counts as labelled when it has text, an aria-label or a title.
    const stats = await browser.execute(() => {
      const buttons = document.querySelectorAll('button');
      const withLabel = Array.from(buttons).filter(b =>
        b.textContent?.trim() || b.getAttribute('aria-label') || b.getAttribute('title')
      ).length;
      return { total: buttons.length, withLabel };
    });
    const result = await askJudge(
      `An app has ${stats.total} buttons, ${stats.withLabel} have text/aria-label/title. Is the labeling ratio (${Math.round(stats.withLabel / Math.max(stats.total, 1) * 100)}%) acceptable? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}`
    );
    expect(result.verdict).toBe('pass');
  });

  it('should render without JS errors', async function () {
    if (SKIP) return this.skip();
    // The console is not inspected here; the count of error elements in the
    // DOM (toasts, load errors, error boundaries) is what gets asserted.
    const errorCount = await browser.execute(() => {
      return document.querySelectorAll('.toast-error, .load-error, .error-boundary').length;
    });
    expect(errorCount).toBeLessThanOrEqual(1);
  });

  it('should have responsive grid layout', async function () {
    if (SKIP) return this.skip();
    const grid = await browser.execute(() => {
      const el = document.querySelector('.project-grid');
      if (!el) return null;
      const rect = el.getBoundingClientRect();
      return { width: rect.width, height: rect.height, display: getComputedStyle(el).display };
    });
    // No grid in the DOM: report as skipped rather than a silent pass.
    if (!grid) return this.skip();
    expect(grid.width).toBeGreaterThan(0);
    expect(grid.height).toBeGreaterThan(0);
  });

  it('should have status bar with meaningful content', async function () {
    if (SKIP) return this.skip();
    const content = await browser.execute(() => {
      const bar = document.querySelector('[data-testid="status-bar"]')
        ?? document.querySelector('.status-bar');
      return bar?.textContent?.trim() ?? '';
    });
    const result = await askJudge(
      `A status bar contains this text: "${content.slice(0, 500)}". Does it contain useful info (version, agent status, cost, etc.)? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}`
    );
    expect(result.verdict).toBe('pass');
  });
});