Infrastructure: - adapters/: base, tauri (port 9750), electrobun (port 9761 + PTY daemon) - helpers/: 120+ centralized selectors, reusable actions, custom assertions - wdio.shared.conf.js + stack-specific configs 18 unified specs (205 tests): splash(6) smoke(15) settings(19) terminal(14) agent(15) search(12) files(15) comms(10) tasks(10) theme(12) groups(12) keyboard(8) notifications(10) diagnostics(8) status-bar(12) context(9) worktree(8) llm-judged(10) Daemon: --stack tauri|electrobun|both flag Scripts: test:e2e:tauri, test:e2e:electrobun, test:e2e:both
202 lines
7.4 KiB
TypeScript
202 lines
7.4 KiB
TypeScript
/**
 * LLM-judged tests — uses Claude Haiku to evaluate UI quality.
 *
 * These tests are SKIPPED when ANTHROPIC_API_KEY is not set.
 * They capture DOM snapshots and ask the LLM to judge correctness.
 */
|
|
|
|
import { browser, expect } from '@wdio/globals';
|
|
|
|
/**
 * Anthropic API key for the LLM judge, read once from the environment.
 *
 * Surrounding whitespace (for example a trailing newline carried over from a
 * CI secret) is trimmed so it cannot end up inside an HTTP header value.
 * `||` is deliberate here: a blank string must collapse to `undefined` so that
 * an empty variable behaves exactly like an unset one.
 */
const API_KEY = process.env.ANTHROPIC_API_KEY?.trim() || undefined;

/** True when no usable API key is configured; tests in this file skip themselves in that case. */
const SKIP = !API_KEY;
|
|
|
|
/**
 * Concatenates the text of every text-bearing content block in a Messages API
 * response body. Returns an empty string when the body has no usable text.
 */
function extractReplyText(data: unknown): string {
  if (typeof data !== 'object' || data === null) return '';
  const blocks = (data as { content?: unknown }).content;
  if (!Array.isArray(blocks)) return '';
  return blocks
    .map((block: unknown) => {
      if (typeof block !== 'object' || block === null) return '';
      const value = (block as { text?: unknown }).text;
      return typeof value === 'string' ? value : '';
    })
    .join('');
}

/**
 * Converts the judge's raw reply into a verdict, failing closed.
 *
 * A reply only counts as a pass when it either contains a JSON object whose
 * `verdict` is exactly `"pass"`, or is plain text that mentions "pass" without
 * also mentioning a failure or negating the pass ("does not pass").
 */
function parseJudgeReply(text: string): { verdict: 'pass' | 'fail'; reasoning: string } {
  if (text.trim() === '') {
    return { verdict: 'fail', reasoning: '(empty reply from judge)' };
  }

  // Models frequently wrap the JSON in prose or Markdown fences, so parse the
  // outermost {...} span instead of the whole reply.
  const start = text.indexOf('{');
  const end = text.lastIndexOf('}');
  if (start !== -1 && end > start) {
    try {
      const parsed: unknown = JSON.parse(text.slice(start, end + 1));
      if (typeof parsed === 'object' && parsed !== null) {
        const { verdict, reasoning } = parsed as { verdict?: unknown; reasoning?: unknown };
        if (verdict === 'pass' || verdict === 'fail') {
          return { verdict, reasoning: typeof reasoning === 'string' ? reasoning : text };
        }
      }
    } catch {
      // Not valid JSON — fall through to the plain-text heuristic below.
    }
  }

  const mentionsPass = /\bpass(?:es|ed)?\b/i.test(text);
  const mentionsFail =
    /\bfail(?:s|ed|ure)?\b/i.test(text) || /(?:\b(?:can)?not|n['’]t)\s+pass\b/i.test(text);
  return { verdict: mentionsPass && !mentionsFail ? 'pass' : 'fail', reasoning: text };
}

/**
 * Sends `prompt` to the Anthropic Messages API and returns the judge's verdict.
 *
 * @param prompt Full instruction text; callers ask the model to answer with
 *   JSON of the form `{"verdict":"pass"|"fail","reasoning":"..."}`.
 * @returns The parsed verdict and the model's reasoning. When no API key is
 *   configured a synthetic `pass` is returned without any network call.
 * @throws Error when the API responds with a non-2xx status, so that a bad key
 *   or unknown model is reported as such instead of as a judged failure.
 */
async function askJudge(prompt: string): Promise<{ verdict: 'pass' | 'fail'; reasoning: string }> {
  if (!API_KEY) return { verdict: 'pass', reasoning: 'Skipped — no API key' };

  const res = await fetch('https://api.anthropic.com/v1/messages', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': API_KEY,
      'anthropic-version': '2023-06-01',
    },
    body: JSON.stringify({
      // NOTE(review): confirm this model ID against the current Anthropic model
      // list; an unknown ID produces an HTTP error, which is surfaced below.
      model: 'claude-haiku-4-5-20250315',
      max_tokens: 300,
      messages: [{ role: 'user', content: prompt }],
    }),
  });

  if (!res.ok) {
    const detail = await res.text();
    throw new Error(`LLM judge request failed: HTTP ${res.status} ${detail.slice(0, 300)}`);
  }

  const data: unknown = await res.json();
  return parseJudgeReply(extractReplyText(data));
}
|
|
|
|
/**
 * UI quality checks whose pass/fail decision is delegated to an LLM judge.
 *
 * Every test skips itself when `SKIP` is true (no API key configured). Most
 * tests collect a small snapshot from the page via `browser.execute`, embed it
 * in a prompt, and assert that `askJudge` returns a `pass` verdict. Two tests
 * (JS-error count and grid size) are plain deterministic assertions.
 */
describe('LLM-judged UI quality', () => {
  /**
   * Asserts that the judge passed, printing its reasoning first when it did
   * not — otherwise the only failure output would be "expected 'fail' to be 'pass'".
   */
  function expectJudgedPass(result: Awaited<ReturnType<typeof askJudge>>): void {
    if (result.verdict !== 'pass') {
      console.error(`LLM judge reasoning: ${result.reasoning}`);
    }
    expect(result.verdict).toBe('pass');
  }

  it('should have complete settings panel', async function () {
    if (SKIP) return this.skip();

    // NOTE(review): this test does not open the settings drawer itself; it
    // presumably relies on the drawer already being open or on the
    // `.sidebar-panel` fallback — confirm against the spec ordering.
    const html = await browser.execute(() => {
      const panel =
        document.querySelector('.settings-drawer') ?? document.querySelector('.sidebar-panel');
      // Cap the snapshot so the prompt stays small.
      return panel?.innerHTML?.slice(0, 2000) ?? '';
    });

    const result = await askJudge(
      `You are a UI testing judge. Given this settings panel HTML, does it contain reasonable settings categories (theme, font, projects, etc.)? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}\n\nHTML:\n${html}`
    );
    expectJudgedPass(result);
  });

  it('should have visually consistent theme', async function () {
    if (SKIP) return this.skip();

    // Read a handful of palette custom properties from the root element.
    const vars = await browser.execute(() => {
      const rootStyle = getComputedStyle(document.documentElement);
      return {
        base: rootStyle.getPropertyValue('--ctp-base').trim(),
        text: rootStyle.getPropertyValue('--ctp-text').trim(),
        surface0: rootStyle.getPropertyValue('--ctp-surface0').trim(),
        blue: rootStyle.getPropertyValue('--ctp-blue').trim(),
      };
    });

    const result = await askJudge(
      `You are a UI theme judge. Given these CSS custom property values from a dark-theme app, do they form a visually consistent palette? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}\n\nVariables: ${JSON.stringify(vars)}`
    );
    expectJudgedPass(result);
  });

  it('should have proper error handling in UI', async function () {
    if (SKIP) return this.skip();

    const toasts = await browser.execute(() => {
      return document.querySelectorAll('.toast-error, .load-error').length;
    });

    // The prompt states the threshold and asks about the *observed* count, so
    // the judge cannot answer the abstract question "is 0-1 acceptable?".
    const result = await askJudge(
      `A UI app shows ${toasts} error toasts after loading. For a freshly launched test instance, 0-1 errors are acceptable. Is the observed count of ${toasts} acceptable? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}`
    );
    expectJudgedPass(result);
  });

  it('should have readable text contrast', async function () {
    if (SKIP) return this.skip();

    const colors = await browser.execute(() => {
      const bodyStyle = getComputedStyle(document.body);
      return {
        bg: bodyStyle.backgroundColor,
        text: bodyStyle.color,
        font: bodyStyle.fontFamily,
        size: bodyStyle.fontSize,
      };
    });

    const result = await askJudge(
      `You are an accessibility judge. Given body background="${colors.bg}", text color="${colors.text}", font="${colors.font}", size="${colors.size}" — does this have adequate contrast for readability? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}`
    );
    expectJudgedPass(result);
  });

  it('should have well-structured project cards', async function () {
    if (SKIP) return this.skip();

    const html = await browser.execute(() => {
      const card = document.querySelector('.project-card');
      return card?.innerHTML?.slice(0, 1500) ?? '';
    });

    // No card rendered: nothing to judge. Report it as skipped rather than
    // letting the test silently count as a pass.
    if (!html) return this.skip();

    const result = await askJudge(
      `You are a UI judge. Does this project card HTML contain expected sections (header, agent/terminal area, tabs)? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}\n\nHTML:\n${html}`
    );
    expectJudgedPass(result);
  });

  it('should have consistent layout structure', async function () {
    if (SKIP) return this.skip();

    // Summarise the direct children of the app shell (or <body> as a fallback).
    const layout = await browser.execute(() => {
      const root = document.querySelector('.app-shell') ?? document.body;
      return Array.from(root.children).map((child) => {
        const rect = child.getBoundingClientRect();
        return {
          tag: child.tagName,
          // getAttribute also works for SVG elements, whose `className` is
          // not a string and has no `split`.
          cls: (child.getAttribute('class') ?? '').split(' ').slice(0, 3).join(' '),
          w: rect.width,
          h: rect.height,
        };
      });
    });

    const result = await askJudge(
      `You are a layout judge. This app has these top-level children: ${JSON.stringify(layout)}. Does this look like a reasonable app layout (sidebar, main content, status bar)? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}`
    );
    expectJudgedPass(result);
  });

  it('should have accessible interactive elements', async function () {
    if (SKIP) return this.skip();

    // Count buttons that expose some label: visible text, aria-label, or title.
    const stats = await browser.execute(() => {
      const buttons = document.querySelectorAll('button');
      const withLabel = Array.from(buttons).filter(
        (b) => b.textContent?.trim() || b.getAttribute('aria-label') || b.getAttribute('title')
      ).length;
      return { total: buttons.length, withLabel };
    });

    // Math.max guards the division when the page has no buttons at all.
    const result = await askJudge(
      `An app has ${stats.total} buttons, ${stats.withLabel} have text/aria-label/title. Is the labeling ratio (${Math.round(stats.withLabel / Math.max(stats.total, 1) * 100)}%) acceptable? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}`
    );
    expectJudgedPass(result);
  });

  it('should render without JS errors', async function () {
    if (SKIP) return this.skip();

    // Deterministic check (no LLM call): counts error toasts, load errors and
    // error-boundary elements present in the DOM. The console is not inspected.
    const errorCount = await browser.execute(() => {
      return document.querySelectorAll('.toast-error, .load-error, .error-boundary').length;
    });

    expect(errorCount).toBeLessThanOrEqual(1);
  });

  it('should have responsive grid layout', async function () {
    if (SKIP) return this.skip();

    // Deterministic check (no LLM call): the grid must occupy a non-zero box.
    const grid = await browser.execute(() => {
      const el = document.querySelector('.project-grid');
      if (!el) return null;
      const rect = el.getBoundingClientRect();
      return { width: rect.width, height: rect.height, display: getComputedStyle(el).display };
    });

    // No grid on screen: report as skipped instead of silently passing.
    if (!grid) return this.skip();

    expect(grid.width).toBeGreaterThan(0);
    expect(grid.height).toBeGreaterThan(0);
  });

  it('should have status bar with meaningful content', async function () {
    if (SKIP) return this.skip();

    const content = await browser.execute(() => {
      const bar =
        document.querySelector('[data-testid="status-bar"]') ??
        document.querySelector('.status-bar');
      return bar?.textContent?.trim() ?? '';
    });

    const result = await askJudge(
      `A status bar contains this text: "${content.slice(0, 500)}". Does it contain useful info (version, agent status, cost, etc.)? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}`
    );
    expectJudgedPass(result);
  });
});
|