// agent-orchestrator/tests/e2e/specs/phase-f-llm.test.ts

import { browser, expect } from '@wdio/globals';
import { isJudgeAvailable, assertWithJudge } from '../infra/llm-judge';
import { exec } from '../helpers/execute.ts';

// Phase F — LLM-Judged Tests (F4–F7)
// Settings completeness, theme system quality, error handling, and UI consistency.

// ─── Helpers ──────────────────────────────────────────────────────────

/**
 * Make sure the settings panel is showing.
 *
 * If `.settings-panel` is not displayed, the `[data-testid="settings-btn"]`
 * element is clicked inside the page (when present) and the panel is polled
 * until it reports as displayed; the poll fails after 5s. A short trailing
 * pause runs whether or not the panel had to be opened.
 */
async function openSettings(): Promise<void> {
  // A rejected display lookup is treated the same as "not displayed".
  const panelVisible = async (): Promise<boolean> => {
    const el = await browser.$('.settings-panel');
    return el.isDisplayed().catch(() => false);
  };

  const alreadyOpen = await panelVisible();
  if (!alreadyOpen) {
    await exec(() => {
      const settingsButton = document.querySelector<HTMLElement>('[data-testid="settings-btn"]');
      settingsButton?.click();
    });
    await browser.pause(500);
    await browser.waitUntil(panelVisible, {
      timeout: 5000,
      timeoutMsg: 'Settings panel did not open within 5s',
    });
  }

  await browser.pause(300);
}

/**
 * Dismiss the settings panel when it is currently displayed.
 *
 * Clicks the first `.settings-close` / `.panel-close` element found in the
 * page (if any), then pauses briefly. Does nothing when the panel is not
 * displayed or its display state cannot be read.
 */
async function closeSettings(): Promise<void> {
  const settingsPanel = await browser.$('.settings-panel');
  const visible = await settingsPanel.isDisplayed().catch(() => false);
  if (!visible) {
    return;
  }

  await exec(() => {
    const closeButton = document.querySelector<HTMLElement>('.settings-close, .panel-close');
    closeButton?.click();
  });
  await browser.pause(400);
}

/**
 * Click the first settings sidebar entry whose text contains `label`.
 *
 * Candidates are the buttons and `[role="tab"]` elements under
 * `.settings-sidebar`, matched by substring on their text content.
 *
 * @param label Text to look for in a sidebar entry.
 * @returns `true` if an entry was found and clicked, `false` otherwise.
 */
async function clickSettingsCategory(label: string): Promise<boolean> {
  return exec((wanted) => {
    const candidates = Array.from(
      document.querySelectorAll<HTMLElement>('.settings-sidebar button, .settings-sidebar [role="tab"]'),
    );
    const match = candidates.find((node) => node.textContent?.includes(wanted));
    if (match === undefined) {
      return false;
    }
    match.click();
    return true;
  }, label);
}

/**
 * Get the text content of the settings content area.
 *
 * Looks up `.settings-content` first and falls back to `.settings-panel` only
 * when no content element exists. A single combined selector list would
 * instead return whichever matching element comes first in document order, so
 * if the content area is nested inside the panel (assumed — confirm against
 * the markup) the whole panel, sidebar labels included, would be returned.
 *
 * @returns The chosen element's `textContent`, or `''` when neither element
 *   exists.
 */
async function getSettingsContent(): Promise<string> {
  return exec(() => {
    const content =
      document.querySelector('.settings-content') ?? document.querySelector('.settings-panel');
    return content?.textContent ?? '';
  });
}

/** Message logged by a scenario right before it skips itself because `isJudgeAvailable()` returned false. */
const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';

// ─── Scenario F4: LLM-Judged Settings Completeness (Extended) ────────

/**
 * F4: walks the six settings categories, collects the text shown for each,
 * and asks the LLM judge whether every category is populated with real
 * settings. Skipped when no judge is available.
 */
describe('Scenario F4 — LLM-Judged Settings Completeness', () => {
  // Close the panel once the scenario is done so it does not stay open for later scenarios.
  after(async () => {
    await closeSettings();
  });

  // `function` rather than an arrow so `this.skip()` is reachable.
  it('should have all 6 settings categories with meaningful content', async function () {
    if (!isJudgeAvailable()) {
      console.log(SKIP_MSG);
      this.skip();
      return;
    }

    await openSettings();

    // Collect content from each category
    const categories = ['Appearance', 'Agents', 'Security', 'Projects', 'Orchestration', 'Advanced'];
    const categoryContents: Record<string, string> = {};

    for (const cat of categories) {
      const clicked = await clickSettingsCategory(cat);
      if (clicked) {
        // Give the newly selected category a moment to render before reading it.
        await browser.pause(300);
        categoryContents[cat] = await getSettingsContent();
      } else {
        // Keep a placeholder so the judge sees the category is missing.
        categoryContents[cat] = '(category not found in sidebar)';
      }
    }

    // One markdown-style section per category, capped at 500 chars each.
    const summary = Object.entries(categoryContents)
      .map(([cat, text]) => `## ${cat}\n${text.slice(0, 500)}`)
      .join('\n\n');

    const verdict = await assertWithJudge(
      'The settings panel should have 6 categories: Appearance, Agents, Security, Projects, Orchestration, and Advanced. Each category should have at least 2 configurable settings visible (dropdowns, inputs, toggles, sliders, etc.). Are all categories populated with real settings, not empty or error states?',
      summary,
      { context: 'AGOR v3 settings panel with sidebar navigation between 6 categories. Each has dedicated settings components.' },
    );

    // Log the judge's reasoning BEFORE asserting: a failing expect throws, so
    // a log placed after it would never run on the failure path.
    if (!verdict.pass) {
      console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
    }
    expect(verdict.pass).toBe(true);
  });
});

// ─── Scenario F5: LLM-Judged Theme System Quality ────────────────────

/**
 * F5: opens the Appearance category, expands the first custom dropdown, and
 * asks the LLM judge to evaluate a slice of the settings HTML as a theme and
 * font selection interface. Skipped when no judge is available.
 */
describe('Scenario F5 — LLM-Judged Theme System Quality', () => {
  // Close the panel once the scenario is done so it does not stay open for later scenarios.
  after(async () => {
    await closeSettings();
  });

  // `function` rather than an arrow so `this.skip()` is reachable.
  it('should present a comprehensive theme selection interface', async function () {
    if (!isJudgeAvailable()) {
      console.log(SKIP_MSG);
      this.skip();
      return;
    }

    await openSettings();
    await clickSettingsCategory('Appearance');
    await browser.pause(300);

    // Open theme dropdown to capture options
    await exec(() => {
      const trigger = document.querySelector('.custom-dropdown .dropdown-trigger');
      if (trigger) (trigger as HTMLElement).click();
    });
    await browser.pause(300);

    const themeHtml = await exec(() => {
      const panel = document.querySelector('.settings-content, .settings-panel');
      if (!panel) return '';
      // Get appearance section HTML for structure analysis (first 3000 chars only)
      return panel.innerHTML.slice(0, 3000);
    });

    // Close dropdown
    await exec(() => document.body.click());
    await browser.pause(200);

    const verdict = await assertWithJudge(
      'This is the Appearance settings section of a desktop app. Does it have: (1) a theme selector with multiple theme options organized in groups (Catppuccin, Editor, Deep Dark), (2) font settings for both UI and terminal with family dropdowns and size controls, (3) visual organization with clear labels and sections? It should look like a polished settings interface.',
      themeHtml,
      { context: 'AGOR v3 has 17 themes in 3 groups (4 Catppuccin + 7 Editor + 6 Deep Dark), custom dropdown UI, UI font + terminal font dropdowns with size steppers.' },
    );

    // Log the judge's reasoning BEFORE asserting: a failing expect throws, so
    // a log placed after it would never run on the failure path.
    if (!verdict.pass) {
      console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
    }
    expect(verdict.pass).toBe(true);
  });
});

// ─── Scenario F6: LLM-Judged Error Handling Quality ──────────────────

/**
 * F6: gathers the text of any toasts, error elements, the status bar, and
 * alerts currently in the page, and asks the LLM judge whether the messages
 * are user-friendly rather than raw internal errors. Skipped when no judge is
 * available.
 */
describe('Scenario F6 — LLM-Judged Error Handling Quality', () => {
  // `function` rather than an arrow so `this.skip()` is reachable.
  it('should show user-friendly error messages, not raw errors', async function () {
    if (!isJudgeAvailable()) {
      console.log(SKIP_MSG);
      this.skip();
      return;
    }

    // Capture any visible toast notifications, error states, or warnings
    const errorContent = await exec(() => {
      const results: string[] = [];

      // Check toast notifications
      const toasts = document.querySelectorAll('.toast, .notification, [data-testid="toast"]');
      toasts.forEach(t => results.push(`Toast: ${t.textContent?.trim()}`));

      // Check for error states in agent panes
      const errorEls = document.querySelectorAll('.error, .error-message, [data-agent-status="error"]');
      errorEls.forEach(e => results.push(`Error: ${e.textContent?.trim()}`));

      // Check status bar for error indicators
      const statusBar = document.querySelector('[data-testid="status-bar"]');
      if (statusBar) results.push(`StatusBar: ${statusBar.textContent?.trim()}`);

      // Check for any visible alerts or warnings
      const alerts = document.querySelectorAll('[role="alert"], .alert, .warning');
      alerts.forEach(a => results.push(`Alert: ${a.textContent?.trim()}`));

      // Give the judge an explicit statement when nothing at all was collected.
      return results.length > 0 ? results.join('\n') : 'No error messages currently visible. The app is in a clean state.';
    });

    const verdict = await assertWithJudge(
      'These are the currently visible error/notification/status messages from a desktop developer tools app. Evaluate: (1) Are any messages raw stack traces or "[object Object]"? (2) If error messages exist, are they user-friendly with actionable guidance? (3) If no errors are visible, is that a reasonable state for an app with idle agents? The app should NOT show raw internal errors to users.',
      errorContent,
      { context: 'AGOR v3 uses toast notifications for agent events, status bar for fleet state. Error classifier categorizes API errors into 6 types with user-friendly messages.' },
    );

    // Log the judge's reasoning BEFORE asserting: a failing expect throws, so
    // a log placed after it would never run on the failure path.
    if (!verdict.pass) {
      console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
    }
    expect(verdict.pass).toBe(true);
  });
});

// ─── Scenario F7: LLM-Judged Overall UI Quality ─────────────────────

/**
 * F7: builds a textual snapshot of the main UI (sidebar rail, status bar,
 * first project box, body styles, and the `--ctp-base` custom property) and
 * asks the LLM judge to rate its structural and visual consistency. Skipped
 * when no judge is available.
 */
describe('Scenario F7 — LLM-Judged Overall UI Quality', () => {
  // `function` rather than an arrow so `this.skip()` is reachable.
  it('should present a professional, consistent dark-theme UI', async function () {
    if (!isJudgeAvailable()) {
      console.log(SKIP_MSG);
      this.skip();
      return;
    }

    // Capture full page structure and key visual elements
    const uiSnapshot = await exec(() => {
      const elements: string[] = [];

      // Sidebar rail
      const rail = document.querySelector('[data-testid="sidebar-rail"]');
      if (rail) elements.push(`Sidebar: ${rail.innerHTML.slice(0, 300)}`);

      // Status bar
      const bar = document.querySelector('[data-testid="status-bar"]');
      if (bar) elements.push(`StatusBar: ${bar.innerHTML.slice(0, 500)}`);

      // Project boxes: always report the count, but only detail the first one
      const boxes = document.querySelectorAll('[data-testid="project-box"]');
      elements.push(`ProjectBoxes: ${boxes.length} rendered`);
      if (boxes[0]) {
        const header = boxes[0].querySelector('.project-header');
        const tabs = boxes[0].querySelector('[data-testid="project-tabs"]');
        if (header) elements.push(`Header: ${header.innerHTML.slice(0, 300)}`);
        if (tabs) elements.push(`Tabs: ${tabs.innerHTML.slice(0, 400)}`);
      }

      // Overall body styles
      const body = document.body;
      const styles = window.getComputedStyle(body);
      elements.push(`Body bg: ${styles.backgroundColor}, color: ${styles.color}, font: ${styles.fontFamily.slice(0, 60)}`);

      // Check CSS custom properties are applied
      const root = document.documentElement;
      const rootStyles = window.getComputedStyle(root);
      const ctp = rootStyles.getPropertyValue('--ctp-base');
      elements.push(`Theme var --ctp-base: ${ctp || 'not set'}`);

      return elements.join('\n\n');
    });

    const verdict = await assertWithJudge(
      'This is a structural snapshot of a developer tools dashboard UI. Rate the visual consistency: (1) Are CSS custom properties (--ctp-*) being used for theming (indicating consistent color system)? (2) Does the layout have clear structure (sidebar, status bar, project boxes with tabs)? (3) Is the font family set to a proper UI font (not monospace for the main UI)? (4) Is the information hierarchy clear (header, tabs, content areas)? A professional app should have all of these.',
      uiSnapshot,
      { context: 'AGOR v3 uses Catppuccin theme system with 26 --ctp-* CSS vars, VSCode-style sidebar layout, sans-serif UI font, project boxes with tab bars.' },
    );

    // Log the judge's reasoning BEFORE asserting: a failing expect throws, so
    // a log placed after it would never run on the failure path.
    if (!verdict.pass) {
      console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
    }
    expect(verdict.pass).toBe(true);
  });
});