// LLM Judge — evaluates test outcomes via Claude.
//
// Two backends, configurable via LLM_JUDGE_BACKEND env var:
//   "cli" — Claude CLI (default, no API key needed)
//   "api" — Anthropic REST API (requires ANTHROPIC_API_KEY)
//
// CLI backend: spawns `claude` with --output-format text, parses JSON verdict.
// API backend: raw fetch to messages API, same JSON verdict parsing.
//
// Skips gracefully when neither backend is available.

import { execFileSync, execSync } from 'node:child_process';
import { existsSync } from 'node:fs';

const MODEL = 'claude-haiku-4-5-20251001';
const API_URL = 'https://api.anthropic.com/v1/messages';
const MAX_TOKENS = 512;

// Upper bound for a single judge call; shared by both backends so neither
// can hang a test run indefinitely.
const REQUEST_TIMEOUT_MS = 60_000;

// CLI search paths (in order). Home-relative candidates are only included
// when HOME is set, so we never probe a bogus "undefined/..." relative path.
const HOME_DIR = process.env.HOME;
const CLI_PATHS: readonly string[] = [
  ...(HOME_DIR
    ? [`${HOME_DIR}/.local/bin/claude`, `${HOME_DIR}/.claude/local/claude`]
    : []),
  '/usr/local/bin/claude',
  '/usr/bin/claude',
];

export type JudgeBackend = 'cli' | 'api';

export interface JudgeVerdict {
  pass: boolean;
  reasoning: string;
  confidence: number; // 0-1
}

/**
 * Type guard: true for plain (non-null, non-array) objects.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/**
 * Find the Claude CLI binary path, or null if not installed.
 *
 * Checks the well-known install locations first, then falls back to
 * `which claude` to search PATH.
 */
function findClaudeCli(): string | null {
  for (const candidate of CLI_PATHS) {
    if (existsSync(candidate)) return candidate;
  }

  // Fallback: check PATH
  try {
    const which = execSync('which claude 2>/dev/null', { encoding: 'utf-8' }).trim();
    if (which) return which;
  } catch {
    // `which` exits non-zero when the binary is not on PATH — treat as not found.
  }
  return null;
}

/**
 * Determine which backend to use.
 *
 * Env var LLM_JUDGE_BACKEND ("cli" or "api", case-insensitive) overrides
 * auto-detection; any other value falls through to auto-detection.
 * Auto: CLI if available, then API if key set, else null.
 */
function resolveBackend(): JudgeBackend | null {
  const explicit = process.env.LLM_JUDGE_BACKEND?.toLowerCase();
  if (explicit === 'cli') return findClaudeCli() ? 'cli' : null;
  if (explicit === 'api') return process.env.ANTHROPIC_API_KEY ? 'api' : null;

  // Auto-detect: CLI first, API fallback
  if (findClaudeCli()) return 'cli';
  if (process.env.ANTHROPIC_API_KEY) return 'api';
  return null;
}

/**
 * Check if the LLM judge is available (CLI installed or API key set).
 */
export function isJudgeAvailable(): boolean {
  return resolveBackend() !== null;
}

/**
 * Build the prompt for the judge.
 *
 * @param criteria - What the output is expected to satisfy.
 * @param actual - The output captured from the application.
 * @param context - Optional extra information appended as its own section.
 * @returns The system prompt and the user message.
 */
function buildPrompt(
  criteria: string,
  actual: string,
  context?: string,
): { system: string; user: string } {
  const system = [
    'You are a test assertion judge for a terminal emulator application called BTerminal.',
    'Your job is to evaluate whether actual output from the application meets the given criteria.',
    'Respond with EXACTLY this JSON format, nothing else:',
    '{"pass": true/false, "reasoning": "brief explanation", "confidence": 0.0-1.0}',
  ].join('\n');

  const user = [
    '## Criteria',
    criteria,
    '',
    '## Actual Output',
    actual,
    ...(context ? ['', '## Additional Context', context] : []),
    '',
    'Does the actual output satisfy the criteria? Respond with JSON only.',
  ].join('\n');

  return { system, user };
}

/**
 * Extract and validate a JudgeVerdict from raw text output.
 *
 * The model may wrap the JSON in prose or code fences, so the widest
 * `{...}` span is tried first. If that span is not valid JSON (e.g. the
 * model added trailing text containing braces), each flat `{...}` object
 * in the text is tried in order.
 *
 * @returns A normalized verdict: `confidence` is clamped to [0, 1]
 *   (non-numeric becomes 0) and `reasoning` is always a string.
 * @throws Error if no JSON object can be parsed, or if no parsed object
 *   has a boolean `pass` field.
 */
function parseVerdict(text: string): JudgeVerdict {
  const widest = text.match(/\{[\s\S]*\}/);
  if (!widest) {
    throw new Error(`LLM judge returned non-JSON: ${text}`);
  }

  const candidates = [widest[0], ...(text.match(/\{[^{}]*\}/g) ?? [])];
  let sawJson = false;

  for (const candidate of candidates) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(candidate);
    } catch {
      continue; // not valid JSON — try the next candidate
    }
    sawJson = true;

    if (!isRecord(parsed) || typeof parsed.pass !== 'boolean') continue;

    const confidence = Number(parsed.confidence);
    return {
      pass: parsed.pass,
      reasoning: typeof parsed.reasoning === 'string' ? parsed.reasoning : '',
      confidence: Number.isFinite(confidence) ? Math.min(1, Math.max(0, confidence)) : 0,
    };
  }

  throw new Error(
    sawJson
      ? `LLM judge returned invalid verdict: ${text}`
      : `LLM judge returned non-JSON: ${text}`,
  );
}

/**
 * Pull the first text block out of a Messages API response payload.
 * Returns '' when the payload has no text content.
 */
function extractResponseText(data: unknown): string {
  if (!isRecord(data) || !Array.isArray(data.content)) return '';
  for (const block of data.content) {
    if (isRecord(block) && typeof block.text === 'string') return block.text;
  }
  return '';
}

/**
 * Judge via Claude CLI (spawns subprocess).
 * Unsets CLAUDECODE to avoid nested session errors.
 *
 * Note: the subprocess is run synchronously, so this blocks the event loop
 * until the CLI exits or the timeout elapses.
 *
 * @throws Error if the CLI is not found, the subprocess fails or times out,
 *   or the output cannot be parsed into a verdict.
 */
async function judgeCli(
  criteria: string,
  actual: string,
  context?: string,
): Promise<JudgeVerdict> {
  const cliPath = findClaudeCli();
  if (!cliPath) throw new Error('Claude CLI not found');

  const { system, user } = buildPrompt(criteria, actual, context);

  // Actually remove the variable rather than setting it to '' — an empty
  // value still counts as "defined" for presence checks.
  const env = { ...process.env };
  delete env.CLAUDECODE;

  const output = execFileSync(
    cliPath,
    [
      '-p', user,
      '--model', MODEL,
      '--output-format', 'text',
      '--system-prompt', system,
      '--setting-sources', 'user', // skip project CLAUDE.md
    ],
    {
      encoding: 'utf-8',
      timeout: REQUEST_TIMEOUT_MS,
      cwd: '/tmp', // avoid loading project CLAUDE.md
      env,
      maxBuffer: 1024 * 1024,
    },
  );

  return parseVerdict(output);
}

/**
 * Judge via Anthropic REST API (raw fetch).
 *
 * The request is aborted after REQUEST_TIMEOUT_MS, matching the CLI backend.
 *
 * @throws Error if ANTHROPIC_API_KEY is not set, the request fails, times
 *   out or returns a non-2xx status, or the response cannot be parsed into
 *   a verdict.
 */
async function judgeApi(
  criteria: string,
  actual: string,
  context?: string,
): Promise<JudgeVerdict> {
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) throw new Error('ANTHROPIC_API_KEY not set');

  const { system, user } = buildPrompt(criteria, actual, context);

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);

  try {
    const response = await fetch(API_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-api-key': apiKey,
        'anthropic-version': '2023-06-01',
      },
      body: JSON.stringify({
        model: MODEL,
        max_tokens: MAX_TOKENS,
        system,
        messages: [{ role: 'user', content: user }],
      }),
      signal: controller.signal,
    });

    if (!response.ok) {
      const body = await response.text();
      throw new Error(`Anthropic API error ${response.status}: ${body}`);
    }

    const data: unknown = await response.json();
    return parseVerdict(extractResponseText(data));
  } finally {
    // Always clear the timer so it cannot keep the process alive.
    clearTimeout(timer);
  }
}

/**
 * Ask Claude to evaluate whether `actual` output satisfies `criteria`.
 *
 * Uses CLI backend by default, falls back to API. Override with
 * LLM_JUDGE_BACKEND env var ("cli" or "api").
 *
 * Returns a structured verdict with pass/fail, reasoning, and confidence.
 * Throws if no backend available or call fails.
 */
export async function judge(
  criteria: string,
  actual: string,
  context?: string,
): Promise<JudgeVerdict> {
  const backend = resolveBackend();
  if (!backend) {
    throw new Error('LLM judge unavailable — no Claude CLI found and ANTHROPIC_API_KEY not set');
  }

  if (backend === 'cli') {
    return judgeCli(criteria, actual, context);
  }
  return judgeApi(criteria, actual, context);
}

/**
 * Convenience: judge with a minimum confidence threshold.
 * Returns pass=true only if verdict.pass=true AND confidence >= threshold.
 *
 * @param options.context - Optional extra context forwarded to the judge.
 * @param options.minConfidence - Threshold in [0, 1]; defaults to 0.7.
 */
export async function assertWithJudge(
  criteria: string,
  actual: string,
  options: { context?: string; minConfidence?: number } = {},
): Promise<JudgeVerdict> {
  const { context, minConfidence = 0.7 } = options;

  const verdict = await judge(criteria, actual, context);

  if (verdict.pass && verdict.confidence < minConfidence) {
    // A low-confidence pass is downgraded to a failure, with the reason noted.
    return {
      ...verdict,
      pass: false,
      reasoning: `${verdict.reasoning} (confidence ${verdict.confidence} below threshold ${minConfidence})`,
    };
  }

  return verdict;
}