feat: refactor LLM judge to dual-mode CLI/API and fix config test race

Refactor llm-judge.ts from raw API-only to dual-mode: CLI first
(spawns claude with --output-format text, unsets CLAUDECODE), API
fallback. Backend selectable via LLM_JUDGE_BACKEND env var.

Fix pre-existing race condition in config.rs tests where parallel
test execution caused env var mutations to interfere. Added static
Mutex to serialize env-mutating tests.
This commit is contained in:
Hibryda 2026-03-12 06:35:04 +01:00
parent 05c9e1abbb
commit a3185656eb
4 changed files with 169 additions and 42 deletions

View file

@ -109,9 +109,15 @@ impl AppConfig {
#[cfg(test)]
mod tests {
use super::*;
use std::sync::Mutex;
// Serialize all tests that mutate env vars to prevent race conditions.
// Rust runs tests in parallel; set_var/remove_var are process-global.
static ENV_LOCK: Mutex<()> = Mutex::new(());
#[test]
fn test_production_paths_use_dirs() {
let _lock = ENV_LOCK.lock().unwrap();
// Without BTERMINAL_TEST=1, paths should use dirs:: defaults
std::env::remove_var("BTERMINAL_TEST");
std::env::remove_var("BTERMINAL_TEST_DATA_DIR");
@ -129,6 +135,7 @@ mod tests {
#[test]
fn test_btmsg_db_path() {
let _lock = ENV_LOCK.lock().unwrap();
std::env::remove_var("BTERMINAL_TEST");
let config = AppConfig::from_env();
let path = config.btmsg_db_path();
@ -138,6 +145,7 @@ mod tests {
#[test]
fn test_groups_json_path() {
let _lock = ENV_LOCK.lock().unwrap();
std::env::remove_var("BTERMINAL_TEST");
let config = AppConfig::from_env();
let path = config.groups_json_path();
@ -146,6 +154,7 @@ mod tests {
#[test]
fn test_test_mode_uses_overrides() {
let _lock = ENV_LOCK.lock().unwrap();
std::env::set_var("BTERMINAL_TEST", "1");
std::env::set_var("BTERMINAL_TEST_DATA_DIR", "/tmp/bt-test-data");
std::env::set_var("BTERMINAL_TEST_CONFIG_DIR", "/tmp/bt-test-config");
@ -168,6 +177,7 @@ mod tests {
#[test]
fn test_test_mode_without_overrides_uses_defaults() {
let _lock = ENV_LOCK.lock().unwrap();
std::env::set_var("BTERMINAL_TEST", "1");
std::env::remove_var("BTERMINAL_TEST_DATA_DIR");
std::env::remove_var("BTERMINAL_TEST_CONFIG_DIR");
@ -183,6 +193,7 @@ mod tests {
#[test]
fn test_test_mode_memora_in_data_dir() {
let _lock = ENV_LOCK.lock().unwrap();
std::env::set_var("BTERMINAL_TEST", "1");
std::env::set_var("BTERMINAL_TEST_DATA_DIR", "/tmp/bt-test-data");

View file

@ -1,11 +1,31 @@
// LLM Judge — evaluates test outcomes via Claude API // LLM Judge — evaluates test outcomes via Claude.
// Uses raw fetch (no SDK dep). Requires ANTHROPIC_API_KEY env var. //
// Skips gracefully when API key is absent. // Two backends, configurable via LLM_JUDGE_BACKEND env var:
// "cli" — Claude CLI (default, no API key needed)
// "api" — Anthropic REST API (requires ANTHROPIC_API_KEY)
//
// CLI backend: spawns `claude` with --output-format text, parses JSON verdict.
// API backend: raw fetch to messages API, same JSON verdict parsing.
//
// Skips gracefully when neither backend is available.
import { execFileSync, execSync } from 'node:child_process';
import { existsSync } from 'node:fs';
const MODEL = 'claude-haiku-4-5-20251001';
const API_URL = 'https://api.anthropic.com/v1/messages';
const MODEL = 'claude-haiku-4-5-20251001'; // Fast + cheap for test judging
const MAX_TOKENS = 512;
// CLI search paths (in order)
const CLI_PATHS = [
`${process.env.HOME}/.local/bin/claude`,
`${process.env.HOME}/.claude/local/claude`,
'/usr/local/bin/claude',
'/usr/bin/claude',
];
export type JudgeBackend = 'cli' | 'api';
export interface JudgeVerdict {
pass: boolean;
reasoning: string;
@ -13,34 +33,55 @@ export interface JudgeVerdict {
}
/**
* Check if the LLM judge is available (API key set). * Find the Claude CLI binary path, or null if not installed.
*/ */
export function isJudgeAvailable(): boolean { function findClaudeCli(): string | null {
return !!process.env.ANTHROPIC_API_KEY; for (const p of CLI_PATHS) {
if (existsSync(p)) return p;
}
// Fallback: check PATH
try {
const which = execSync('which claude 2>/dev/null', { encoding: 'utf-8' }).trim();
if (which) return which;
} catch {
// not found
}
return null;
} }
/** /**
* Ask Claude to evaluate whether `actual` output satisfies `criteria`. * Determine which backend to use.
* * Env var LLM_JUDGE_BACKEND overrides auto-detection.
* Returns a structured verdict with pass/fail, reasoning, and confidence. * Auto: CLI if available, then API if key set, else null.
* Throws if API call fails (caller should catch and handle).
*/ */
export async function judge( function resolveBackend(): JudgeBackend | null {
criteria: string, const explicit = process.env.LLM_JUDGE_BACKEND?.toLowerCase();
actual: string, if (explicit === 'cli') return findClaudeCli() ? 'cli' : null;
context?: string, if (explicit === 'api') return process.env.ANTHROPIC_API_KEY ? 'api' : null;
): Promise<JudgeVerdict> {
const apiKey = process.env.ANTHROPIC_API_KEY; // Auto-detect: CLI first, API fallback
if (!apiKey) { if (findClaudeCli()) return 'cli';
throw new Error('ANTHROPIC_API_KEY not set — LLM judge unavailable'); if (process.env.ANTHROPIC_API_KEY) return 'api';
return null;
} }
const systemPrompt = `You are a test assertion judge for a terminal emulator application called BTerminal. /**
* Check if the LLM judge is available (CLI installed or API key set).
*/
export function isJudgeAvailable(): boolean {
return resolveBackend() !== null;
}
/**
* Build the prompt for the judge.
*/
function buildPrompt(criteria: string, actual: string, context?: string): { system: string; user: string } {
const system = `You are a test assertion judge for a terminal emulator application called BTerminal.
Your job is to evaluate whether actual output from the application meets the given criteria.
Respond with EXACTLY this JSON format, nothing else:
{"pass": true/false, "reasoning": "brief explanation", "confidence": 0.0-1.0}`;
const userPrompt = [ const user = [
'## Criteria',
criteria,
'',
@ -51,6 +92,71 @@ Respond with EXACTLY this JSON format, nothing else:
'Does the actual output satisfy the criteria? Respond with JSON only.',
].join('\n');
return { system, user };
}
/**
* Extract and validate a JudgeVerdict from raw text output.
*/
function parseVerdict(text: string): JudgeVerdict {
const jsonMatch = text.match(/\{[\s\S]*\}/);
if (!jsonMatch) {
throw new Error(`LLM judge returned non-JSON: ${text}`);
}
const verdict = JSON.parse(jsonMatch[0]) as JudgeVerdict;
if (typeof verdict.pass !== 'boolean') {
throw new Error(`LLM judge returned invalid verdict: ${text}`);
}
verdict.confidence = Number(verdict.confidence) || 0;
verdict.reasoning = String(verdict.reasoning || '');
return verdict;
}
/**
* Judge via Claude CLI (spawns subprocess).
* Unsets CLAUDECODE to avoid nested session errors.
*/
async function judgeCli(
criteria: string,
actual: string,
context?: string,
): Promise<JudgeVerdict> {
const cliPath = findClaudeCli();
if (!cliPath) throw new Error('Claude CLI not found');
const { system, user } = buildPrompt(criteria, actual, context);
const fullPrompt = `${system}\n\n${user}`;
const output = execFileSync(cliPath, [
'-p', fullPrompt,
'--model', MODEL,
'--output-format', 'text',
], {
encoding: 'utf-8',
timeout: 60_000,
env: { ...process.env, CLAUDECODE: '' },
maxBuffer: 1024 * 1024,
});
return parseVerdict(output);
}
/**
* Judge via Anthropic REST API (raw fetch).
*/
async function judgeApi(
criteria: string,
actual: string,
context?: string,
): Promise<JudgeVerdict> {
const apiKey = process.env.ANTHROPIC_API_KEY;
if (!apiKey) throw new Error('ANTHROPIC_API_KEY not set');
const { system, user } = buildPrompt(criteria, actual, context);
const response = await fetch(API_URL, {
method: 'POST',
headers: {
@ -61,8 +167,8 @@ Respond with EXACTLY this JSON format, nothing else:
body: JSON.stringify({
model: MODEL,
max_tokens: MAX_TOKENS,
system: systemPrompt, system,
messages: [{ role: 'user', content: userPrompt }], messages: [{ role: 'user', content: user }],
}),
});
@ -74,27 +180,37 @@ Respond with EXACTLY this JSON format, nothing else:
const data = await response.json();
const text = data.content?.[0]?.text ?? ''; return parseVerdict(text);
// Extract JSON from response (may have markdown fences) return parseVerdict(text);
const jsonMatch = text.match(/\{[\s\S]*\}/);
if (!jsonMatch) {
throw new Error(`LLM judge returned non-JSON: ${text}`);
} }
const verdict = JSON.parse(jsonMatch[0]) as JudgeVerdict; /**
* Ask Claude to evaluate whether `actual` output satisfies `criteria`.
// Validate structure *
if (typeof verdict.pass !== 'boolean') { * Uses CLI backend by default, falls back to API. Override with
throw new Error(`LLM judge returned invalid verdict: ${text}`); * LLM_JUDGE_BACKEND env var ("cli" or "api").
*
* Returns a structured verdict with pass/fail, reasoning, and confidence.
* Throws if no backend available or call fails.
*/
export async function judge(
criteria: string,
actual: string,
context?: string,
): Promise<JudgeVerdict> {
const backend = resolveBackend();
if (!backend) {
throw new Error('LLM judge unavailable — no Claude CLI found and ANTHROPIC_API_KEY not set');
} }
verdict.confidence = Number(verdict.confidence) || 0;
verdict.reasoning = String(verdict.reasoning || '');
return verdict; if (backend === 'cli') {
return judgeCli(criteria, actual, context);
}
return judgeApi(criteria, actual, context);
} }
/**
* Convenience: judge with a minimum confidence threshold. * Convenience: judge with a minimum confidence threshold.
* Returns true only if pass=true AND confidence >= threshold. * Returns pass=true only if verdict.pass=true AND confidence >= threshold.
*/
export async function assertWithJudge(
criteria: string,

View file

@ -217,7 +217,7 @@ describe('Scenario B3 — Status Bar Fleet State', () => {
// ─── Scenario B4: LLM-judged agent response (requires API key) ──────
describe('Scenario B4 — LLM-Judged Agent Response', () => {
const SKIP_MSG = 'Skipping — ANTHROPIC_API_KEY not set'; const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';
it('should send prompt and get meaningful response', async function () {
if (!isJudgeAvailable()) {
@ -297,7 +297,7 @@ describe('Scenario B4 — LLM-Judged Agent Response', () => {
// ─── Scenario B5: LLM-judged code generation quality ─────────────────
describe('Scenario B5 — LLM-Judged Code Generation', () => {
const SKIP_MSG = 'Skipping — ANTHROPIC_API_KEY not set'; const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';
it('should generate valid code when asked', async function () {
if (!isJudgeAvailable()) {

View file

@ -559,7 +559,7 @@ describe('Scenario C9 — Files Tab & Code Editor', () => {
describe('Scenario C10 — LLM-Judged Settings Completeness', () => {
it('should have comprehensive settings panel', async function () {
if (!isJudgeAvailable()) {
console.log('Skipping — ANTHROPIC_API_KEY not set'); console.log('Skipping — LLM judge not available (no CLI or API key)');
this.skip();
return;
}
@ -597,7 +597,7 @@ describe('Scenario C10 — LLM-Judged Settings Completeness', () => {
describe('Scenario C11 — LLM-Judged Status Bar Completeness', () => {
it('should render a comprehensive status bar', async function () {
if (!isJudgeAvailable()) {
console.log('Skipping — ANTHROPIC_API_KEY not set'); console.log('Skipping — LLM judge not available (no CLI or API key)');
this.skip();
return;
}