/**
 * LLM-judged tests — uses Claude Haiku to evaluate UI quality.
 *
 * These tests are SKIPPED when ANTHROPIC_API_KEY is not set.
 * They capture DOM snapshots and ask the LLM to judge correctness.
 */
import { browser, expect } from '@wdio/globals';

const API_KEY = process.env.ANTHROPIC_API_KEY;
const SKIP = !API_KEY;

// NOTE(review): confirm this snapshot id against the provider's published model list;
// an unknown id makes every judged test fail with an HTTP error from askJudge.
const JUDGE_MODEL = 'claude-haiku-4-5-20250315';

/** Instruction appended to every prompt so the judge answers in a machine-readable form. */
const REPLY_FORMAT = 'Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}';

type Verdict = 'pass' | 'fail';

interface JudgeResult {
  verdict: Verdict;
  reasoning: string;
}

/** Returns the value as a normalized verdict, or `undefined` if it is not "pass"/"fail". */
function toVerdict(value: unknown): Verdict | undefined {
  if (typeof value !== 'string') return undefined;
  const word = value.trim().toLowerCase();
  return word === 'pass' || word === 'fail' ? word : undefined;
}

/** Pulls the first text block out of a Messages API response body; '' if there is none. */
function firstTextBlock(data: unknown): string {
  if (typeof data !== 'object' || data === null) return '';
  const content = (data as Record<string, unknown>).content;
  if (!Array.isArray(content)) return '';
  for (const block of content as unknown[]) {
    if (typeof block === 'object' && block !== null) {
      const text = (block as Record<string, unknown>).text;
      if (typeof text === 'string') return text;
    }
  }
  return '';
}

/**
 * Turns the judge's raw reply into a verdict.
 *
 * Fails closed: anything that cannot be read as an explicit "pass" is reported
 * as "fail", so a malformed or ambiguous reply can never turn a test green.
 */
function parseVerdict(text: string): JudgeResult {
  // Models frequently wrap the JSON in markdown fences or surround it with prose,
  // so parse the outermost {...} span rather than the whole reply.
  const start = text.indexOf('{');
  const end = text.lastIndexOf('}');
  if (start !== -1 && end > start) {
    try {
      const parsed: unknown = JSON.parse(text.slice(start, end + 1));
      if (typeof parsed === 'object' && parsed !== null) {
        const fields = parsed as Record<string, unknown>;
        const verdict = toVerdict(fields.verdict);
        if (verdict !== undefined) {
          const reasoning = typeof fields.reasoning === 'string' ? fields.reasoning : text;
          return { verdict, reasoning };
        }
      }
    } catch {
      // Not valid JSON (e.g. truncated by max_tokens) — fall through to the text heuristics.
    }
  }

  // Heuristic fallback: an explicit `verdict: pass|fail` statement, or a reply that
  // simply starts with the word. A bare substring search would misread "does not pass".
  const explicit = /verdict\W{0,6}(pass|fail)\b/i.exec(text) ?? /^\W*(pass|fail)\b/i.exec(text);
  const verdict = toVerdict(explicit?.[1]);
  if (verdict !== undefined) return { verdict, reasoning: text };

  return { verdict: 'fail', reasoning: `Could not parse judge reply: ${text}` };
}

/**
 * Sends `prompt` to the judge model and returns its verdict.
 *
 * @param prompt - Full prompt text, including the reply-format instruction.
 * @returns The parsed verdict and the judge's reasoning. When no API key is
 *   configured this resolves to a "pass" placeholder without any network call.
 * @throws Error if the HTTP request does not succeed, so infrastructure problems
 *   are reported as such instead of masquerading as a UI failure.
 */
async function askJudge(prompt: string): Promise<{ verdict: 'pass' | 'fail'; reasoning: string }> {
  if (!API_KEY) return { verdict: 'pass', reasoning: 'Skipped — no API key' };

  const res = await fetch('https://api.anthropic.com/v1/messages', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': API_KEY,
      'anthropic-version': '2023-06-01',
    },
    body: JSON.stringify({
      model: JUDGE_MODEL,
      max_tokens: 300,
      messages: [{ role: 'user', content: prompt }],
    }),
  });

  if (!res.ok) {
    const detail = await res.text().catch(() => '');
    throw new Error(
      `LLM judge request failed: HTTP ${res.status} ${res.statusText} ${detail.slice(0, 500)}`.trim(),
    );
  }

  const data: unknown = await res.json();
  return parseVerdict(firstTextBlock(data));
}

/** Asserts the judge passed, logging its reasoning first so a failure is diagnosable. */
function expectJudgePass(result: JudgeResult): void {
  if (result.verdict !== 'pass') {
    console.error(`LLM judge verdict: ${result.verdict} — ${result.reasoning}`);
  }
  expect(result.verdict).toBe('pass');
}

describe('LLM-judged UI quality', () => {
  it('should have complete settings panel', async function () {
    if (SKIP) return this.skip();

    const html = await browser.execute(() => {
      const panel =
        document.querySelector('.settings-drawer') ?? document.querySelector('.sidebar-panel');
      // Cap the snapshot so the prompt stays small.
      return panel?.innerHTML?.slice(0, 2000) ?? '';
    });

    const result = await askJudge(
      'You are a UI testing judge. Given this settings panel HTML, does it contain reasonable ' +
        `settings categories (theme, font, projects, etc.)? \n${REPLY_FORMAT}\n\nHTML:\n${html}`,
    );
    expectJudgePass(result);
  });

  it('should have visually consistent theme', async function () {
    if (SKIP) return this.skip();

    const vars = await browser.execute(() => {
      const rootStyle = getComputedStyle(document.documentElement);
      return {
        base: rootStyle.getPropertyValue('--ctp-base').trim(),
        text: rootStyle.getPropertyValue('--ctp-text').trim(),
        surface0: rootStyle.getPropertyValue('--ctp-surface0').trim(),
        blue: rootStyle.getPropertyValue('--ctp-blue').trim(),
      };
    });

    const result = await askJudge(
      'You are a UI theme judge. Given these CSS custom property values from a dark-theme app, ' +
        `do they form a visually consistent palette? ${REPLY_FORMAT}\n\nVariables: ${JSON.stringify(vars)}`,
    );
    expectJudgePass(result);
  });

  it('should have proper error handling in UI', async function () {
    if (SKIP) return this.skip();

    const toasts = await browser.execute(() => {
      return document.querySelectorAll('.toast-error, .load-error').length;
    });

    const result = await askJudge(
      `A UI app shows ${toasts} error toasts after loading. For a freshly launched test instance, ` +
        `is 0-1 errors acceptable? ${REPLY_FORMAT}`,
    );
    expectJudgePass(result);
  });

  it('should have readable text contrast', async function () {
    if (SKIP) return this.skip();

    const colors = await browser.execute(() => {
      const body = getComputedStyle(document.body);
      return {
        bg: body.backgroundColor,
        text: body.color,
        font: body.fontFamily,
        size: body.fontSize,
      };
    });

    const result = await askJudge(
      `You are an accessibility judge. Given body background="${colors.bg}", ` +
        `text color="${colors.text}", font="${colors.font}", size="${colors.size}" — ` +
        `does this have adequate contrast for readability? \n${REPLY_FORMAT}`,
    );
    expectJudgePass(result);
  });

  it('should have well-structured project cards', async function () {
    if (SKIP) return this.skip();

    const html = await browser.execute(() => {
      const card = document.querySelector('.project-card');
      return card?.innerHTML?.slice(0, 1500) ?? '';
    });
    // No card rendered: report the test as skipped rather than as a silent pass.
    if (!html) return this.skip();

    const result = await askJudge(
      'You are a UI judge. Does this project card HTML contain expected sections ' +
        `(header, agent/terminal area, tabs)? ${REPLY_FORMAT}\n\nHTML:\n${html}`,
    );
    expectJudgePass(result);
  });

  it('should have consistent layout structure', async function () {
    if (SKIP) return this.skip();

    const layout = await browser.execute(() => {
      const root = document.querySelector('.app-shell') ?? document.body;
      return Array.from(root.children).map((child) => {
        const rect = child.getBoundingClientRect();
        return {
          tag: child.tagName,
          // Read the attribute: `className` is not a string on SVG elements.
          cls: (child.getAttribute('class') ?? '').split(' ').slice(0, 3).join(' '),
          w: rect.width,
          h: rect.height,
        };
      });
    });

    const result = await askJudge(
      `You are a layout judge. This app has these top-level children: ${JSON.stringify(layout)}. ` +
        `Does this look like a reasonable app layout (sidebar, main content, status bar)? ${REPLY_FORMAT}`,
    );
    expectJudgePass(result);
  });

  it('should have accessible interactive elements', async function () {
    if (SKIP) return this.skip();

    const stats = await browser.execute(() => {
      const buttons = document.querySelectorAll('button');
      const withLabel = Array.from(buttons).filter(
        (b) => b.textContent?.trim() || b.getAttribute('aria-label') || b.getAttribute('title'),
      ).length;
      return { total: buttons.length, withLabel };
    });

    // Math.max guards the division when the page has no buttons at all.
    const ratio = Math.round((stats.withLabel / Math.max(stats.total, 1)) * 100);
    const result = await askJudge(
      `An app has ${stats.total} buttons, ${stats.withLabel} have text/aria-label/title. ` +
        `Is the labeling ratio (${ratio}%) acceptable? \n${REPLY_FORMAT}`,
    );
    expectJudgePass(result);
  });

  it('should render without JS errors', async function () {
    if (SKIP) return this.skip();

    // The browser console is not inspected here; visible error UI is counted instead.
    const errorCount = await browser.execute(() => {
      return document.querySelectorAll('.toast-error, .load-error, .error-boundary').length;
    });
    expect(errorCount).toBeLessThanOrEqual(1);
  });

  it('should have responsive grid layout', async function () {
    if (SKIP) return this.skip();

    const grid = await browser.execute(() => {
      const el = document.querySelector('.project-grid');
      if (!el) return null;
      const rect = el.getBoundingClientRect();
      return { width: rect.width, height: rect.height, display: getComputedStyle(el).display };
    });
    // No grid rendered: report the test as skipped rather than as a silent pass.
    if (!grid) return this.skip();

    expect(grid.width).toBeGreaterThan(0);
    expect(grid.height).toBeGreaterThan(0);
  });

  it('should have status bar with meaningful content', async function () {
    if (SKIP) return this.skip();

    const content = await browser.execute(() => {
      const bar =
        document.querySelector('[data-testid="status-bar"]') ?? document.querySelector('.status-bar');
      return bar?.textContent?.trim() ?? '';
    });

    const result = await askJudge(
      `A status bar contains this text: "${content.slice(0, 500)}". ` +
        `Does it contain useful info (version, agent status, cost, etc.)? ${REPLY_FORMAT}`,
    );
    expectJudgePass(result);
  });
});