agent-orchestrator/tests/e2e/specs/phase-b-llm.test.ts

import { browser, expect } from '@wdio/globals';
import { isJudgeAvailable, assertWithJudge } from '../infra/llm-judge';

// Phase B — LLM: LLM-judged agent responses, code generation, context tab.
// Scenarios B4-B6 + new agent/context tests. Requires ANTHROPIC_API_KEY for LLM tests.

// ─── Helpers ──────────────────────────────────────────────────────────

async function getProjectIds(): Promise<string[]> {
  return browser.execute(() => {
    return Array.from(document.querySelectorAll('[data-testid="project-box"]'))
      .map((b) => b.getAttribute('data-project-id') ?? '').filter(Boolean);
  });
}
async function focusProject(id: string): Promise<void> {
  await browser.execute((pid) => {
    (document.querySelector(`[data-project-id="${pid}"] .project-header`) as HTMLElement)?.click();
  }, id);
  await browser.pause(300);
}
async function getAgentStatus(id: string): Promise<string> {
  return browser.execute((pid) =>
    document.querySelector(`[data-project-id="${pid}"] [data-testid="agent-pane"]`)?.getAttribute('data-agent-status') ?? 'not-found', id);
}
async function sendPromptInProject(id: string, text: string): Promise<void> {
  await focusProject(id);
  await browser.execute((pid, prompt) => {
    const ta = document.querySelector(`[data-project-id="${pid}"] [data-testid="agent-prompt"]`) as HTMLTextAreaElement | null;
    if (ta) { ta.value = prompt; ta.dispatchEvent(new Event('input', { bubbles: true })); }
  }, id, text);
  await browser.pause(200);
  await browser.execute((pid) => {
    (document.querySelector(`[data-project-id="${pid}"] [data-testid="agent-submit"]`) as HTMLElement)?.click();
  }, id);
}
async function waitForAgentStatus(id: string, status: string, timeout = 60_000): Promise<void> {
  await browser.waitUntil(async () => (await getAgentStatus(id)) === status,
    { timeout, timeoutMsg: `Agent ${id} did not reach "${status}" in ${timeout}ms` });
}
async function getAgentMessages(id: string): Promise<string> {
  return browser.execute((pid) =>
    document.querySelector(`[data-project-id="${pid}"] [data-testid="agent-messages"]`)?.textContent ?? '', id);
}
async function switchTab(id: string, idx: number): Promise<void> {
  await browser.execute((pid, i) => {
    const tabs = document.querySelector(`[data-project-id="${pid}"]`)?.querySelectorAll('[data-testid="project-tabs"] .ptab');
    if (tabs?.[i]) (tabs[i] as HTMLElement).click();
  }, id, idx);
  await browser.pause(300);
}

// ─── Scenario B4: LLM-judged agent response (requires API key) ──────

describe('Scenario B4 — LLM-Judged Agent Response', () => {
  const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';

  before(async () => {
    for (const id of await getProjectIds()) await switchTab(id, 0);
  });

  it('should send prompt and get meaningful response', async function () {
    this.timeout(180_000);
    if (!isJudgeAvailable()) { console.log(SKIP_MSG); this.skip(); return; }
    const ids = await getProjectIds();
    if (ids.length < 1) { this.skip(); return; }
    const pid = ids[0];
    await sendPromptInProject(pid, 'List the files in the current directory. Just list them, nothing else.');
    try { await waitForAgentStatus(pid, 'running', 15_000); }
    catch { console.log('Agent did not start'); this.skip(); return; }
    await waitForAgentStatus(pid, 'idle', 120_000);
    const messages = await getAgentMessages(pid);
    const verdict = await assertWithJudge(
      'The output should contain a file listing with at least one filename (like README.md or hello.py), not an error message.',
      messages, { context: 'Agent was asked to list files in a directory containing README.md and hello.py' });
    expect(verdict.pass).toBe(true);
    if (!verdict.pass) console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
  });

  it('should produce response with appropriate tool usage', async function () {
    if (!isJudgeAvailable()) { console.log(SKIP_MSG); this.skip(); return; }
    const ids = await getProjectIds();
    if (ids.length < 1) { this.skip(); return; }
    const messages = await getAgentMessages(ids[0]);
    const verdict = await assertWithJudge(
      'The output should show evidence of tool usage (Bash, Read, Glob, etc.) — tool names, commands, or file paths.',
      messages, { context: 'Agent tool calls rendered in collapsible sections with tool name and output' });
    expect(verdict.pass).toBe(true);
    if (!verdict.pass) console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
  });

  it('should show tool calls in collapsible groups during execution', async function () {
    if (!isJudgeAvailable()) { console.log(SKIP_MSG); this.skip(); return; }
    const ids = await getProjectIds();
    if (ids.length < 1) { this.skip(); return; }
    const messages = await getAgentMessages(ids[0]);
    const verdict = await assertWithJudge(
      'The output should contain tool call/result pairs as collapsible sections with tool names (Bash, Read, Glob, etc.).',
      messages, { context: 'Tool calls rendered in <details> groups.' });
    expect(verdict.pass).toBe(true);
  });

  it('should display cost after agent completes', async function () {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];
    const status = await getAgentStatus(pid);
    if (status === 'idle') {
      const hasCost = await browser.execute((id) => {
        return document.querySelector(`[data-project-id="${id}"] .cost-bar, [data-project-id="${id}"] .usage-meter, [data-project-id="${id}"] [data-testid="agent-cost"]`) !== null;
      }, pid);
      expect(typeof hasCost).toBe('boolean');
    }
  });

  it('should show model name used for response', async function () {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];
    const modelInfo = await browser.execute((id) => {
      const box = document.querySelector(`[data-project-id="${id}"]`);
      const modelEl = box?.querySelector('.model-name, .session-model, [data-testid="agent-model"]');
      const strip = box?.querySelector('.status-strip');
      return (modelEl?.textContent?.trim() ?? '') + (strip?.textContent?.trim() ?? '');
    }, pid);
    expect(typeof modelInfo).toBe('string');
  });
});

// ─── Scenario B5: LLM-judged code generation quality ─────────────────

describe('Scenario B5 — LLM-Judged Code Generation', () => {
  const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';

  it('should generate valid code when asked', async function () {
    this.timeout(180_000);
    if (!isJudgeAvailable()) { console.log(SKIP_MSG); this.skip(); return; }
    const ids = await getProjectIds();
    if (ids.length < 1) { this.skip(); return; }
    const pid = ids[0];
    await sendPromptInProject(pid, 'Read hello.py and tell me what the greet function does. One sentence answer.');
    try { await waitForAgentStatus(pid, 'running', 15_000); }
    catch { console.log('Agent did not start'); this.skip(); return; }
    await waitForAgentStatus(pid, 'idle', 120_000);
    const messages = await getAgentMessages(pid);
    const verdict = await assertWithJudge(
      'The response should describe the greet function taking a name and returning "Hello, {name}!" in roughly one sentence.',
      messages, { context: 'hello.py contains: def greet(name: str) -> str:\n    return f"Hello, {name}!"' });
    expect(verdict.pass).toBe(true);
    if (!verdict.pass) console.log(`LLM Judge: ${verdict.reasoning} (confidence: ${verdict.confidence})`);
  });

  it('should preserve session messages after tab switch and back', async function () {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];
    const before = await getAgentMessages(pid);
    await switchTab(pid, 3);
    await browser.pause(500);
    await switchTab(pid, 0);
    await browser.pause(500);
    const after = await getAgentMessages(pid);
    if (before.length > 0) { expect(after).toBe(before); }
  });
});

// ─── Scenario B6: Context tab reflects agent activity ────────────────

describe('Scenario B6 — Context Tab After Agent Activity', () => {
  it('should show token usage in Context tab after agent ran', async () => {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];
    await switchTab(pid, 2);
    const content = await browser.execute((id) => {
      return document.querySelector(`[data-project-id="${id}"] .context-stats, [data-project-id="${id}"] .token-meter, [data-project-id="${id}"] .stat-value`)?.textContent ?? '';
    }, pid);
    if (content) { expect(content.length).toBeGreaterThan(0); }
    await switchTab(pid, 0);
  });

  it('should show context tab token meter with non-zero tokens after agent activity', async () => {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];
    await switchTab(pid, 2);
    const tokenData = await browser.execute((id) => {
      const box = document.querySelector(`[data-project-id="${id}"]`);
      const meter = box?.querySelector('.token-meter, .context-meter, [data-testid="token-meter"]');
      const stats = box?.querySelectorAll('.stat-value');
      return { meterExists: meter !== null, statCount: stats?.length ?? 0 };
    }, pid);
    if (tokenData.meterExists || tokenData.statCount > 0) {
      expect(tokenData.statCount).toBeGreaterThan(0);
    }
    await switchTab(pid, 0);
  });

  it('should show file references in context tab after agent reads files', async () => {
    const ids = await getProjectIds();
    if (ids.length < 1) return;
    const pid = ids[0];
    await switchTab(pid, 2);
    const refCount = await browser.execute((id) => {
      const box = document.querySelector(`[data-project-id="${id}"]`);
      const refs = box?.querySelectorAll('.file-ref, .file-reference, [data-testid="file-refs"] li');
      return refs?.length ?? 0;
    }, pid);
    if (refCount > 0) { expect(refCount).toBeGreaterThan(0); }
    await switchTab(pid, 0);
  });
});