agent-orchestrator/tests/e2e/specs/llm-judged.test.ts

/**
 * LLM-judged tests — use Claude Haiku to evaluate UI quality.
 *
 * Every test in this file is SKIPPED when ANTHROPIC_API_KEY is not set.
 * Most capture a DOM snapshot (markup, computed styles, element counts) and
 * ask the LLM to judge it; a few make plain deterministic assertions.
 */

import { browser, expect } from '@wdio/globals';

// Credential for the judge API, read once from the environment at load time.
const { ANTHROPIC_API_KEY: API_KEY } = process.env;
// True when no key is configured (unset or empty); each test checks this and
// skips itself instead of calling the judge.
const SKIP = API_KEY === undefined || API_KEY === '';

/** Outcome of one judge call: a binary verdict plus the model's explanation. */
type JudgeResult = { verdict: 'pass' | 'fail'; reasoning: string };

/**
 * Turns the judge's raw reply text into a verdict.
 *
 * Accepts bare JSON as well as JSON surrounded by other text (for example a
 * Markdown code fence). Anything that does not yield an explicit
 * "pass"/"fail" verdict is treated conservatively: it only counts as a pass
 * when the free text mentions passing without any mention of failing or a
 * negated pass.
 */
function parseJudgeReply(text: string): JudgeResult {
  const trimmed = text.trim();
  if (trimmed === '') {
    return { verdict: 'fail', reasoning: 'Judge returned no text content' };
  }

  // Try the whole reply first, then the outermost {...} span so that fenced
  // or prose-wrapped JSON is still read structurally instead of by keyword.
  const candidates = [trimmed];
  const open = trimmed.indexOf('{');
  const close = trimmed.lastIndexOf('}');
  if (open !== -1 && close > open) {
    candidates.push(trimmed.slice(open, close + 1));
  }

  for (const candidate of candidates) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(candidate);
    } catch {
      continue;
    }
    if (typeof parsed !== 'object' || parsed === null) continue;

    const { verdict, reasoning } = parsed as { verdict?: unknown; reasoning?: unknown };
    const explanation = typeof reasoning === 'string' ? reasoning : text;
    const normalized = typeof verdict === 'string' ? verdict.trim().toLowerCase() : '';
    if (normalized === 'pass' || normalized === 'fail') {
      return { verdict: normalized, reasoning: explanation };
    }
    // Structured reply without a usable verdict: never assume success.
    return { verdict: 'fail', reasoning: explanation };
  }

  // Free-text fallback. Any sign of failure or a negated "pass" wins.
  const mentionsPass = /\bpass/i.test(trimmed);
  const mentionsFail = /\bfail/i.test(trimmed);
  const negatedPass = /(?:\bnot|n't|\bcannot|\bnever)\s+pass/i.test(trimmed);
  const verdict = mentionsPass && !mentionsFail && !negatedPass ? 'pass' : 'fail';
  return { verdict, reasoning: text };
}

/**
 * Sends `prompt` to the Anthropic Messages API and returns the judge's verdict.
 *
 * @param prompt - Full instruction text for the judge, including the snapshot to evaluate.
 * @returns The parsed verdict and reasoning. When no API key is configured the
 *   call is short-circuited to a "pass" (callers are expected to skip first).
 * @throws Error when the API answers with a non-2xx status, so that an invalid
 *   key, unknown model or rate limit is reported as such rather than as a
 *   verdict. Network and body-parsing errors from `fetch` propagate unchanged.
 */
async function askJudge(prompt: string): Promise<{ verdict: 'pass' | 'fail'; reasoning: string }> {
  if (!API_KEY) return { verdict: 'pass', reasoning: 'Skipped — no API key' };

  const res = await fetch('https://api.anthropic.com/v1/messages', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': API_KEY,
      'anthropic-version': '2023-06-01',
    },
    body: JSON.stringify({
      // NOTE(review): confirm this model id is still listed by Anthropic; an
      // unknown id now surfaces as an HTTP error below instead of a silent fail.
      model: 'claude-haiku-4-5-20250315',
      max_tokens: 300,
      messages: [{ role: 'user', content: prompt }],
    }),
  });

  if (!res.ok) {
    let detail = '';
    try {
      detail = (await res.text()).slice(0, 500);
    } catch {
      // Body unreadable; the status line alone is still useful.
    }
    throw new Error(`Judge request failed: HTTP ${res.status} ${res.statusText} ${detail}`.trim());
  }

  const data: unknown = await res.json();
  const blocks = (data as { content?: unknown } | null)?.content;
  // Use the first content block that actually carries text.
  const firstText = Array.isArray(blocks)
    ? blocks.find((block) => typeof block?.text === 'string')
    : undefined;
  const text: string = firstText?.text ?? '';

  return parseJudgeReply(text);
}

describe('LLM-judged UI quality', () => {
  // Asks the judge whether the settings UI exposes the expected categories.
  it('should have complete settings panel', async function () {
    if (SKIP) {
      return this.skip();
    }

    // Runs in the browser: prefer the settings drawer, fall back to the
    // sidebar panel, and cap the markup at 2000 characters.
    const panelMarkup = await browser.execute(() => {
      const drawer = document.querySelector('.settings-drawer');
      const container = drawer ?? document.querySelector('.sidebar-panel');
      return container?.innerHTML?.slice(0, 2000) ?? '';
    });

    const prompt = `You are a UI testing judge. Given this settings panel HTML, does it contain reasonable settings categories (theme, font, projects, etc.)? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}\n\nHTML:\n${panelMarkup}`;
    const { verdict } = await askJudge(prompt);
    expect(verdict).toBe('pass');
  });

  // Asks the judge whether four theme custom properties form a coherent palette.
  it('should have visually consistent theme', async function () {
    if (SKIP) {
      return this.skip();
    }

    // Runs in the browser: read the trimmed values of the theme variables
    // from the root element's computed style.
    const palette = await browser.execute(() => {
      const rootStyle = getComputedStyle(document.documentElement);
      const read = (name: string) => rootStyle.getPropertyValue(name).trim();
      return {
        base: read('--ctp-base'),
        text: read('--ctp-text'),
        surface0: read('--ctp-surface0'),
        blue: read('--ctp-blue'),
      };
    });

    const prompt = `You are a UI theme judge. Given these CSS custom property values from a dark-theme app, do they form a visually consistent palette? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}\n\nVariables: ${JSON.stringify(palette)}`;
    const { verdict } = await askJudge(prompt);
    expect(verdict).toBe('pass');
  });

  // Counts error toasts / load-error elements after startup and asks the
  // judge whether that count is acceptable.
  it('should have proper error handling in UI', async function () {
    if (SKIP) return this.skip();

    // Runs in the browser: number of elements matching the error selectors.
    const toasts = await browser.execute(() => {
      return document.querySelectorAll('.toast-error, .load-error').length;
    });

    // The question names the observed count explicitly. Asking only whether
    // "0-1 errors" is acceptable would get the same answer for any count.
    const result = await askJudge(
      `A UI app shows ${toasts} error toasts after loading. For a freshly launched test instance, 0-1 errors is acceptable. Is the observed count of ${toasts} acceptable? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}`
    );
    expect(result.verdict).toBe('pass');
  });

  // Asks the judge whether the body's colours and font give readable contrast.
  it('should have readable text contrast', async function () {
    if (SKIP) {
      return this.skip();
    }

    // Runs in the browser: computed background, text colour, font family and
    // font size of <body>.
    const { bg, text, font, size } = await browser.execute(() => {
      const bodyStyle = getComputedStyle(document.body);
      return {
        bg: bodyStyle.backgroundColor,
        text: bodyStyle.color,
        font: bodyStyle.fontFamily,
        size: bodyStyle.fontSize,
      };
    });

    const prompt = `You are an accessibility judge. Given body background="${bg}", text color="${text}", font="${font}", size="${size}" — does this have adequate contrast for readability? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}`;
    const { verdict } = await askJudge(prompt);
    expect(verdict).toBe('pass');
  });

  // Asks the judge whether the first project card contains the expected sections.
  it('should have well-structured project cards', async function () {
    if (SKIP) return this.skip();

    // Runs in the browser: markup of the first project card, capped at 1500
    // characters; empty string when no card is rendered.
    const html = await browser.execute(() => {
      const card = document.querySelector('.project-card');
      return card?.innerHTML?.slice(0, 1500) ?? '';
    });

    // Nothing to judge: report the test as skipped rather than as a pass, so
    // a missing card is visible in the results.
    if (!html) return this.skip();

    const result = await askJudge(
      `You are a UI judge. Does this project card HTML contain expected sections (header, agent/terminal area, tabs)? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}\n\nHTML:\n${html}`
    );
    expect(result.verdict).toBe('pass');
  });

  // Describes the app shell's direct children (tag, classes, size) and asks
  // the judge whether they look like a reasonable app layout.
  it('should have consistent layout structure', async function () {
    if (SKIP) return this.skip();

    // Runs in the browser: summarise each direct child of .app-shell, falling
    // back to <body> when no shell element exists.
    const layout = await browser.execute(() => {
      const el = document.querySelector('.app-shell') ?? document.body;
      return Array.from(el.children).map((c) => {
        // Measure once; both dimensions come from the same rect.
        const rect = c.getBoundingClientRect();
        return {
          tag: c.tagName,
          // Read the class attribute instead of `className`: on SVG elements
          // `className` is an SVGAnimatedString without `split`, which would
          // throw. Only the first three class tokens are kept.
          cls: (c.getAttribute('class') ?? '').split(' ').slice(0, 3).join(' '),
          w: rect.width,
          h: rect.height,
        };
      });
    });

    const result = await askJudge(
      `You are a layout judge. This app has these top-level children: ${JSON.stringify(layout)}. Does this look like a reasonable app layout (sidebar, main content, status bar)? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}`
    );
    expect(result.verdict).toBe('pass');
  });

  // Asks the judge whether enough buttons carry text, an aria-label or a title.
  it('should have accessible interactive elements', async function () {
    if (SKIP) {
      return this.skip();
    }

    // Runs in the browser: total button count and how many have some label.
    const { total, withLabel } = await browser.execute(() => {
      const allButtons = Array.from(document.querySelectorAll('button'));
      const labelled = allButtons.filter((btn) =>
        Boolean(btn.textContent?.trim() || btn.getAttribute('aria-label') || btn.getAttribute('title'))
      );
      return { total: allButtons.length, withLabel: labelled.length };
    });

    // Denominator is clamped to 1 so a page without buttons yields 0%.
    const percent = Math.round(withLabel / Math.max(total, 1) * 100);
    const prompt = `An app has ${total} buttons, ${withLabel} have text/aria-label/title. Is the labeling ratio (${percent}%) acceptable? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}`;
    const { verdict } = await askJudge(prompt);
    expect(verdict).toBe('pass');
  });

  // Deterministic check (no judge call): at most one error indicator may be
  // present in the DOM.
  it('should render without JS errors', async function () {
    if (SKIP) {
      return this.skip();
    }

    // Runs in the browser: counts error toasts, load-error elements and error
    // boundaries. This inspects the DOM, not the console.
    const visibleErrors = await browser.execute(() => {
      const errorNodes = document.querySelectorAll('.toast-error, .load-error, .error-boundary');
      return errorNodes.length;
    });

    expect(visibleErrors).toBeLessThanOrEqual(1);
  });

  // Deterministic check (no judge call): the project grid, when present, must
  // occupy a non-zero area.
  it('should have responsive grid layout', async function () {
    if (SKIP) return this.skip();

    // Runs in the browser: size and computed display of .project-grid, or
    // null when the element does not exist.
    const grid = await browser.execute(() => {
      const el = document.querySelector('.project-grid');
      if (!el) return null;
      const rect = el.getBoundingClientRect();
      return { width: rect.width, height: rect.height, display: getComputedStyle(el).display };
    });

    // No grid rendered: report as skipped instead of a pass, so the absence
    // is visible in the results.
    if (!grid) return this.skip();
    expect(grid.width).toBeGreaterThan(0);
    expect(grid.height).toBeGreaterThan(0);
  });

  // Asks the judge whether the status bar text carries useful information.
  it('should have status bar with meaningful content', async function () {
    if (SKIP) {
      return this.skip();
    }

    // Runs in the browser: locate the bar by test id first, then by class,
    // and return its trimmed text (empty string when neither exists).
    const statusText = await browser.execute(() => {
      const byTestId = document.querySelector('[data-testid="status-bar"]');
      const statusBar = byTestId ?? document.querySelector('.status-bar');
      return statusBar?.textContent?.trim() ?? '';
    });

    // Only the first 500 characters are sent to the judge.
    const excerpt = statusText.slice(0, 500);
    const prompt = `A status bar contains this text: "${excerpt}". Does it contain useful info (version, agent status, cost, etc.)? Reply with JSON: {"verdict":"pass"|"fail","reasoning":"..."}`;
    const { verdict } = await askJudge(prompt);
    expect(verdict).toBe('pass');
  });
});