diff --git a/v2/bterminal-core/src/config.rs b/v2/bterminal-core/src/config.rs
index effa319..0940841 100644
--- a/v2/bterminal-core/src/config.rs
+++ b/v2/bterminal-core/src/config.rs
@@ -109,9 +109,15 @@ impl AppConfig {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use std::sync::Mutex;
+
+    // Serialize all tests that mutate env vars to prevent race conditions.
+    // Rust runs tests in parallel; set_var/remove_var are process-global.
+    static ENV_LOCK: Mutex<()> = Mutex::new(());
 
     #[test]
     fn test_production_paths_use_dirs() {
+        let _lock = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
         // Without BTERMINAL_TEST=1, paths should use dirs:: defaults
         std::env::remove_var("BTERMINAL_TEST");
         std::env::remove_var("BTERMINAL_TEST_DATA_DIR");
@@ -129,6 +135,7 @@ mod tests {
 
     #[test]
     fn test_btmsg_db_path() {
+        let _lock = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
         std::env::remove_var("BTERMINAL_TEST");
         let config = AppConfig::from_env();
         let path = config.btmsg_db_path();
@@ -138,6 +145,7 @@ mod tests {
 
     #[test]
     fn test_groups_json_path() {
+        let _lock = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
         std::env::remove_var("BTERMINAL_TEST");
         let config = AppConfig::from_env();
         let path = config.groups_json_path();
@@ -146,6 +154,7 @@ mod tests {
 
     #[test]
     fn test_test_mode_uses_overrides() {
+        let _lock = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
         std::env::set_var("BTERMINAL_TEST", "1");
         std::env::set_var("BTERMINAL_TEST_DATA_DIR", "/tmp/bt-test-data");
         std::env::set_var("BTERMINAL_TEST_CONFIG_DIR", "/tmp/bt-test-config");
@@ -168,6 +177,7 @@ mod tests {
 
     #[test]
     fn test_test_mode_without_overrides_uses_defaults() {
+        let _lock = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
         std::env::set_var("BTERMINAL_TEST", "1");
         std::env::remove_var("BTERMINAL_TEST_DATA_DIR");
         std::env::remove_var("BTERMINAL_TEST_CONFIG_DIR");
@@ -183,6 +193,7 @@ mod tests {
 
     #[test]
     fn test_test_mode_memora_in_data_dir() {
+        let _lock = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
         std::env::set_var("BTERMINAL_TEST", "1");
         std::env::set_var("BTERMINAL_TEST_DATA_DIR", "/tmp/bt-test-data");
 
diff --git a/v2/tests/e2e/llm-judge.ts b/v2/tests/e2e/llm-judge.ts
index c2f6a18..0b1d237 100644
--- a/v2/tests/e2e/llm-judge.ts
+++ b/v2/tests/e2e/llm-judge.ts
@@ -1,11 +1,31 @@
-// LLM Judge — evaluates test outcomes via Claude API
-// Uses raw fetch (no SDK dep). Requires ANTHROPIC_API_KEY env var.
-// Skips gracefully when API key is absent.
+// LLM Judge — evaluates test outcomes via Claude.
+//
+// Two backends, configurable via LLM_JUDGE_BACKEND env var:
+//   "cli" — Claude CLI (default, no API key needed)
+//   "api" — Anthropic REST API (requires ANTHROPIC_API_KEY)
+//
+// CLI backend: spawns `claude` with --output-format text, parses JSON verdict.
+// API backend: raw fetch to messages API, same JSON verdict parsing.
+//
+// Skips gracefully when neither backend is available.
+import { execFileSync, execSync } from 'node:child_process';
+import { existsSync } from 'node:fs';
+
+const MODEL = 'claude-haiku-4-5-20251001';
 
 const API_URL = 'https://api.anthropic.com/v1/messages';
-const MODEL = 'claude-haiku-4-5-20251001'; // Fast + cheap for test judging
 const MAX_TOKENS = 512;
 
+// CLI search paths (in order)
+const CLI_PATHS = [
+  `${process.env.HOME}/.local/bin/claude`,
+  `${process.env.HOME}/.claude/local/claude`,
+  '/usr/local/bin/claude',
+  '/usr/bin/claude',
+];
+
+export type JudgeBackend = 'cli' | 'api';
+
 export interface JudgeVerdict {
   pass: boolean;
   reasoning: string;
@@ -13,34 +33,55 @@
 }
 
 /**
- * Check if the LLM judge is available (API key set).
+ * Find the Claude CLI binary path, or null if not installed.
  */
-export function isJudgeAvailable(): boolean {
-  return !!process.env.ANTHROPIC_API_KEY;
+function findClaudeCli(): string | null {
+  for (const p of CLI_PATHS) {
+    if (existsSync(p)) return p;
+  }
+  // Fallback: check PATH
+  try {
+    const which = execSync('which claude 2>/dev/null', { encoding: 'utf-8' }).trim();
+    if (which) return which;
+  } catch {
+    // not found
+  }
+  return null;
 }
 
 /**
- * Ask Claude to evaluate whether `actual` output satisfies `criteria`.
- *
- * Returns a structured verdict with pass/fail, reasoning, and confidence.
- * Throws if API call fails (caller should catch and handle).
+ * Determine which backend to use.
+ * Env var LLM_JUDGE_BACKEND overrides auto-detection.
+ * Auto: CLI if available, then API if key set, else null.
  */
-export async function judge(
-  criteria: string,
-  actual: string,
-  context?: string,
-): Promise<JudgeVerdict> {
-  const apiKey = process.env.ANTHROPIC_API_KEY;
-  if (!apiKey) {
-    throw new Error('ANTHROPIC_API_KEY not set — LLM judge unavailable');
-  }
+function resolveBackend(): JudgeBackend | null {
+  const explicit = process.env.LLM_JUDGE_BACKEND?.toLowerCase();
+  if (explicit === 'cli') return findClaudeCli() ? 'cli' : null;
+  if (explicit === 'api') return process.env.ANTHROPIC_API_KEY ? 'api' : null;
 
-  const systemPrompt = `You are a test assertion judge for a terminal emulator application called BTerminal.
+  // Auto-detect: CLI first, API fallback
+  if (findClaudeCli()) return 'cli';
+  if (process.env.ANTHROPIC_API_KEY) return 'api';
+  return null;
+}
+
+/**
+ * Check if the LLM judge is available (CLI installed or API key set).
+ */
+export function isJudgeAvailable(): boolean {
+  return resolveBackend() !== null;
+}
+
+/**
+ * Build the prompt for the judge.
+ */
+function buildPrompt(criteria: string, actual: string, context?: string): { system: string; user: string } {
+  const system = `You are a test assertion judge for a terminal emulator application called BTerminal.
 
 Your job is to evaluate whether actual output from the application meets the given criteria.
 Respond with EXACTLY this JSON format, nothing else:
 {"pass": true/false, "reasoning": "brief explanation", "confidence": 0.0-1.0}`;
-  const userPrompt = [
+  const user = [
     '## Criteria',
     criteria,
     '',
@@ -51,6 +92,71 @@ Respond with EXACTLY this JSON format, nothing else:
     'Does the actual output satisfy the criteria? Respond with JSON only.',
   ].join('\n');
+
+  return { system, user };
+}
+
+/**
+ * Extract and validate a JudgeVerdict from raw text output.
+ */
+function parseVerdict(text: string): JudgeVerdict {
+  const jsonMatch = text.match(/\{[\s\S]*\}/);
+  if (!jsonMatch) {
+    throw new Error(`LLM judge returned non-JSON: ${text}`);
+  }
+
+  const verdict = JSON.parse(jsonMatch[0]) as JudgeVerdict;
+
+  if (typeof verdict.pass !== 'boolean') {
+    throw new Error(`LLM judge returned invalid verdict: ${text}`);
+  }
+  verdict.confidence = Number(verdict.confidence) || 0;
+  verdict.reasoning = String(verdict.reasoning || '');
+
+  return verdict;
+}
+
+/**
+ * Judge via Claude CLI (spawns subprocess).
+ * Clears CLAUDECODE (sets it empty) to avoid nested session errors.
+ */
+async function judgeCli(
+  criteria: string,
+  actual: string,
+  context?: string,
+): Promise<JudgeVerdict> {
+  const cliPath = findClaudeCli();
+  if (!cliPath) throw new Error('Claude CLI not found');
+
+  const { system, user } = buildPrompt(criteria, actual, context);
+  const fullPrompt = `${system}\n\n${user}`;
+
+  const output = execFileSync(cliPath, [
+    '-p', fullPrompt,
+    '--model', MODEL,
+    '--output-format', 'text',
+  ], {
+    encoding: 'utf-8',
+    timeout: 60_000,
+    env: { ...process.env, CLAUDECODE: '' },
+    maxBuffer: 1024 * 1024,
+  });
+
+  return parseVerdict(output);
+}
+
+/**
+ * Judge via Anthropic REST API (raw fetch).
+ */
+async function judgeApi(
+  criteria: string,
+  actual: string,
+  context?: string,
+): Promise<JudgeVerdict> {
+  const apiKey = process.env.ANTHROPIC_API_KEY;
+  if (!apiKey) throw new Error('ANTHROPIC_API_KEY not set');
+
+  const { system, user } = buildPrompt(criteria, actual, context);
 
   const response = await fetch(API_URL, {
     method: 'POST',
     headers: {
@@ -61,8 +167,8 @@ Respond with EXACTLY this JSON format, nothing else:
     body: JSON.stringify({
       model: MODEL,
       max_tokens: MAX_TOKENS,
-      system: systemPrompt,
-      messages: [{ role: 'user', content: userPrompt }],
+      system,
+      messages: [{ role: 'user', content: user }],
     }),
   });
 
@@ -74,27 +180,37 @@ Respond with EXACTLY this JSON format, nothing else:
   const data = await response.json();
   const text = data.content?.[0]?.text ?? '';
 
-  // Extract JSON from response (may have markdown fences)
-  const jsonMatch = text.match(/\{[\s\S]*\}/);
-  if (!jsonMatch) {
-    throw new Error(`LLM judge returned non-JSON: ${text}`);
+  return parseVerdict(text);
+}
+
+/**
+ * Ask Claude to evaluate whether `actual` output satisfies `criteria`.
+ *
+ * Uses CLI backend by default, falls back to API. Override with
+ * LLM_JUDGE_BACKEND env var ("cli" or "api").
+ *
+ * Returns a structured verdict with pass/fail, reasoning, and confidence.
+ * Throws if no backend available or call fails.
+ */
+export async function judge(
+  criteria: string,
+  actual: string,
+  context?: string,
+): Promise<JudgeVerdict> {
+  const backend = resolveBackend();
+  if (!backend) {
+    throw new Error('LLM judge unavailable — no Claude CLI found and ANTHROPIC_API_KEY not set');
   }
 
-  const verdict = JSON.parse(jsonMatch[0]) as JudgeVerdict;
-
-  // Validate structure
-  if (typeof verdict.pass !== 'boolean') {
-    throw new Error(`LLM judge returned invalid verdict: ${text}`);
+  if (backend === 'cli') {
+    return judgeCli(criteria, actual, context);
   }
-  verdict.confidence = Number(verdict.confidence) || 0;
-  verdict.reasoning = String(verdict.reasoning || '');
-
-  return verdict;
+  return judgeApi(criteria, actual, context);
 }
 
 /**
  * Convenience: judge with a minimum confidence threshold.
- * Returns true only if pass=true AND confidence >= threshold.
+ * Returns pass=true only if verdict.pass=true AND confidence >= threshold.
  */
 export async function assertWithJudge(
   criteria: string,
diff --git a/v2/tests/e2e/specs/phase-b.test.ts b/v2/tests/e2e/specs/phase-b.test.ts
index c975c41..722cad3 100644
--- a/v2/tests/e2e/specs/phase-b.test.ts
+++ b/v2/tests/e2e/specs/phase-b.test.ts
@@ -217,7 +217,7 @@ describe('Scenario B3 — Status Bar Fleet State', () => {
 
 // ─── Scenario B4: LLM-judged agent response (requires API key) ──────
 describe('Scenario B4 — LLM-Judged Agent Response', () => {
-  const SKIP_MSG = 'Skipping — ANTHROPIC_API_KEY not set';
+  const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';
 
   it('should send prompt and get meaningful response', async function () {
     if (!isJudgeAvailable()) {
@@ -297,7 +297,7 @@ describe('Scenario B4 — LLM-Judged Agent Response', () => {
 
 // ─── Scenario B5: LLM-judged code generation quality ─────────────────
 describe('Scenario B5 — LLM-Judged Code Generation', () => {
-  const SKIP_MSG = 'Skipping — ANTHROPIC_API_KEY not set';
+  const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';
 
   it('should generate valid code when asked', async function () {
     if (!isJudgeAvailable()) {
diff --git a/v2/tests/e2e/specs/phase-c.test.ts b/v2/tests/e2e/specs/phase-c.test.ts
index 244c813..00771d0 100644
--- a/v2/tests/e2e/specs/phase-c.test.ts
+++ b/v2/tests/e2e/specs/phase-c.test.ts
@@ -559,7 +559,7 @@ describe('Scenario C9 — Files Tab & Code Editor', () => {
 describe('Scenario C10 — LLM-Judged Settings Completeness', () => {
   it('should have comprehensive settings panel', async function () {
     if (!isJudgeAvailable()) {
-      console.log('Skipping — ANTHROPIC_API_KEY not set');
+      console.log('Skipping — LLM judge not available (no CLI or API key)');
       this.skip();
       return;
     }
@@ -597,7 +597,7 @@ describe('Scenario C10 — LLM-Judged Settings Completeness', () => {
 describe('Scenario C11 — LLM-Judged Status Bar Completeness', () => {
   it('should render a comprehensive status bar', async function () {
     if (!isJudgeAvailable()) {
-      console.log('Skipping — ANTHROPIC_API_KEY not set');
+      console.log('Skipping — LLM judge not available (no CLI or API key)');
       this.skip();
       return;
     }