feat(e2e): add Phase B scenarios with LLM-judged assertions and multi-project tests

Adds 6 new E2E scenarios in phase-b.test.ts covering multi-project grid
rendering, independent tab switching, status bar fleet state, and
LLM-judged agent response quality evaluation via Claude API.
Includes llm-judge.ts helper (raw Anthropic API fetch, haiku-4-5,
structured verdicts with confidence thresholds).
This commit is contained in:
Hibryda 2026-03-12 03:07:38 +01:00
parent c4c673a4b0
commit 5e4357e4ac
3 changed files with 469 additions and 0 deletions

113
v2/tests/e2e/llm-judge.ts Normal file
View file

@ -0,0 +1,113 @@
// LLM Judge — evaluates test outcomes via Claude API
// Uses raw fetch (no SDK dep). Requires ANTHROPIC_API_KEY env var.
// Skips gracefully when API key is absent.
// Anthropic Messages API endpoint (raw HTTP; no SDK dependency — see L20).
const API_URL = 'https://api.anthropic.com/v1/messages';
const MODEL = 'claude-haiku-4-5-20251001'; // Fast + cheap for test judging
// Upper bound on generated tokens; verdicts are short single-line JSON,
// so 512 is ample headroom.
const MAX_TOKENS = 512;

/**
 * Structured verdict produced by the LLM judge for one evaluation.
 */
export interface JudgeVerdict {
  // Whether the actual output satisfied the criteria.
  pass: boolean;
  // Brief model-written explanation of the verdict.
  reasoning: string;
  confidence: number; // 0-1
}
/**
 * Report whether the LLM judge can run at all, i.e. whether the
 * ANTHROPIC_API_KEY environment variable is set to a non-empty value.
 */
export function isJudgeAvailable(): boolean {
  const key = process.env.ANTHROPIC_API_KEY;
  return key !== undefined && key.length > 0;
}
/**
 * Ask Claude to evaluate whether `actual` output satisfies `criteria`.
 *
 * Returns a structured verdict with pass/fail, reasoning, and confidence.
 * Throws if API call fails (caller should catch and handle).
 *
 * @param criteria - Plain-language description of what the output should show.
 * @param actual - Captured output from the application under test.
 * @param context - Optional extra context appended to the judging prompt.
 * @returns Verdict with `confidence` coerced and clamped into [0, 1].
 * @throws Error when ANTHROPIC_API_KEY is unset, the HTTP call fails, or the
 *   model's reply cannot be parsed into a valid verdict.
 */
export async function judge(
  criteria: string,
  actual: string,
  context?: string,
): Promise<JudgeVerdict> {
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    throw new Error('ANTHROPIC_API_KEY not set — LLM judge unavailable');
  }
  const systemPrompt = `You are a test assertion judge for a terminal emulator application called BTerminal.
Your job is to evaluate whether actual output from the application meets the given criteria.
Respond with EXACTLY this JSON format, nothing else:
{"pass": true/false, "reasoning": "brief explanation", "confidence": 0.0-1.0}`;
  const userPrompt = [
    '## Criteria',
    criteria,
    '',
    '## Actual Output',
    actual,
    ...(context ? ['', '## Additional Context', context] : []),
    '',
    'Does the actual output satisfy the criteria? Respond with JSON only.',
  ].join('\n');
  const response = await fetch(API_URL, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': apiKey,
      'anthropic-version': '2023-06-01',
    },
    body: JSON.stringify({
      model: MODEL,
      max_tokens: MAX_TOKENS,
      system: systemPrompt,
      messages: [{ role: 'user', content: userPrompt }],
    }),
  });
  if (!response.ok) {
    const body = await response.text();
    throw new Error(`Anthropic API error ${response.status}: ${body}`);
  }
  const data = await response.json();
  const text = data.content?.[0]?.text ?? '';
  // Extract JSON from response (may have markdown fences)
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) {
    throw new Error(`LLM judge returned non-JSON: ${text}`);
  }
  // Fix: JSON.parse can throw a bare SyntaxError with no hint of what the
  // model actually said; wrap it so the error carries the offending text.
  let verdict: JudgeVerdict;
  try {
    verdict = JSON.parse(jsonMatch[0]) as JudgeVerdict;
  } catch {
    throw new Error(`LLM judge returned malformed JSON: ${text}`);
  }
  // Validate structure
  if (typeof verdict.pass !== 'boolean') {
    throw new Error(`LLM judge returned invalid verdict: ${text}`);
  }
  // Fix: confidence is documented as 0-1 (interface + system prompt) but was
  // never clamped; coerce, map NaN/Infinity to 0, and clamp into [0, 1] so
  // downstream threshold comparisons are meaningful.
  const rawConfidence = Number(verdict.confidence);
  verdict.confidence = Number.isFinite(rawConfidence)
    ? Math.min(1, Math.max(0, rawConfidence))
    : 0;
  verdict.reasoning = String(verdict.reasoning || '');
  return verdict;
}
/**
 * Convenience wrapper around `judge` that enforces a minimum confidence
 * threshold: the verdict counts as passing only when pass=true AND
 * confidence >= minConfidence (default 0.7). A pass that falls below the
 * threshold is downgraded to a failure with an annotated reason.
 */
export async function assertWithJudge(
  criteria: string,
  actual: string,
  options: { context?: string; minConfidence?: number } = {},
): Promise<JudgeVerdict> {
  const minConfidence = options.minConfidence === undefined ? 0.7 : options.minConfidence;
  const verdict = await judge(criteria, actual, options.context);
  const passedButUnsure = verdict.pass && verdict.confidence < minConfidence;
  if (passedButUnsure) {
    verdict.pass = false;
    verdict.reasoning += ` (confidence ${verdict.confidence} below threshold ${minConfidence})`;
  }
  return verdict;
}