feat: refactor LLM judge to dual-mode CLI/API and fix config test race
Refactor llm-judge.ts from API-only to dual-mode: CLI first (spawns claude with --output-format text and clears CLAUDECODE), with the API as fallback. The backend is selectable via the LLM_JUDGE_BACKEND env var. Fix a pre-existing race condition in the config.rs tests, where parallel test execution caused env var mutations to interfere with each other, by adding a static Mutex that serializes the env-mutating tests.
This commit is contained in:
parent
05c9e1abbb
commit
a3185656eb
4 changed files with 169 additions and 42 deletions
|
|
@ -109,9 +109,15 @@ impl AppConfig {
|
|||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::sync::Mutex;
|
||||
|
||||
// Serialize all tests that mutate env vars to prevent race conditions.
|
||||
// Rust runs tests in parallel; set_var/remove_var are process-global.
|
||||
static ENV_LOCK: Mutex<()> = Mutex::new(());
|
||||
|
||||
#[test]
|
||||
fn test_production_paths_use_dirs() {
|
||||
let _lock = ENV_LOCK.lock().unwrap();
|
||||
// Without BTERMINAL_TEST=1, paths should use dirs:: defaults
|
||||
std::env::remove_var("BTERMINAL_TEST");
|
||||
std::env::remove_var("BTERMINAL_TEST_DATA_DIR");
|
||||
|
|
@ -129,6 +135,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_btmsg_db_path() {
|
||||
let _lock = ENV_LOCK.lock().unwrap();
|
||||
std::env::remove_var("BTERMINAL_TEST");
|
||||
let config = AppConfig::from_env();
|
||||
let path = config.btmsg_db_path();
|
||||
|
|
@ -138,6 +145,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_groups_json_path() {
|
||||
let _lock = ENV_LOCK.lock().unwrap();
|
||||
std::env::remove_var("BTERMINAL_TEST");
|
||||
let config = AppConfig::from_env();
|
||||
let path = config.groups_json_path();
|
||||
|
|
@ -146,6 +154,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_test_mode_uses_overrides() {
|
||||
let _lock = ENV_LOCK.lock().unwrap();
|
||||
std::env::set_var("BTERMINAL_TEST", "1");
|
||||
std::env::set_var("BTERMINAL_TEST_DATA_DIR", "/tmp/bt-test-data");
|
||||
std::env::set_var("BTERMINAL_TEST_CONFIG_DIR", "/tmp/bt-test-config");
|
||||
|
|
@ -168,6 +177,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_test_mode_without_overrides_uses_defaults() {
|
||||
let _lock = ENV_LOCK.lock().unwrap();
|
||||
std::env::set_var("BTERMINAL_TEST", "1");
|
||||
std::env::remove_var("BTERMINAL_TEST_DATA_DIR");
|
||||
std::env::remove_var("BTERMINAL_TEST_CONFIG_DIR");
|
||||
|
|
@ -183,6 +193,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_test_mode_memora_in_data_dir() {
|
||||
let _lock = ENV_LOCK.lock().unwrap();
|
||||
std::env::set_var("BTERMINAL_TEST", "1");
|
||||
std::env::set_var("BTERMINAL_TEST_DATA_DIR", "/tmp/bt-test-data");
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,31 @@
|
|||
// LLM Judge — evaluates test outcomes via Claude API
|
||||
// Uses raw fetch (no SDK dep). Requires ANTHROPIC_API_KEY env var.
|
||||
// Skips gracefully when API key is absent.
|
||||
// LLM Judge — evaluates test outcomes via Claude.
|
||||
//
|
||||
// Two backends, configurable via LLM_JUDGE_BACKEND env var:
|
||||
// "cli" — Claude CLI (default, no API key needed)
|
||||
// "api" — Anthropic REST API (requires ANTHROPIC_API_KEY)
|
||||
//
|
||||
// CLI backend: spawns `claude` with --output-format text, parses JSON verdict.
|
||||
// API backend: raw fetch to messages API, same JSON verdict parsing.
|
||||
//
|
||||
// Skips gracefully when neither backend is available.
|
||||
|
||||
import { execFileSync, execSync } from 'node:child_process';
|
||||
import { existsSync } from 'node:fs';
|
||||
|
||||
const MODEL = 'claude-haiku-4-5-20251001';
|
||||
const API_URL = 'https://api.anthropic.com/v1/messages';
|
||||
const MODEL = 'claude-haiku-4-5-20251001'; // Fast + cheap for test judging
|
||||
const MAX_TOKENS = 512;
|
||||
|
||||
// CLI search paths (in order)
|
||||
const CLI_PATHS = [
|
||||
`${process.env.HOME}/.local/bin/claude`,
|
||||
`${process.env.HOME}/.claude/local/claude`,
|
||||
'/usr/local/bin/claude',
|
||||
'/usr/bin/claude',
|
||||
];
|
||||
|
||||
export type JudgeBackend = 'cli' | 'api';
|
||||
|
||||
export interface JudgeVerdict {
|
||||
pass: boolean;
|
||||
reasoning: string;
|
||||
|
|
@ -13,34 +33,55 @@ export interface JudgeVerdict {
|
|||
}
|
||||
|
||||
/**
|
||||
* Check if the LLM judge is available (API key set).
|
||||
* Find the Claude CLI binary path, or null if not installed.
|
||||
*/
|
||||
export function isJudgeAvailable(): boolean {
|
||||
return !!process.env.ANTHROPIC_API_KEY;
|
||||
function findClaudeCli(): string | null {
  // Well-known install locations win, in the order CLI_PATHS lists them.
  const knownLocation = CLI_PATHS.find((candidate) => existsSync(candidate));
  if (knownLocation !== undefined) return knownLocation;

  // Otherwise let the shell resolve `claude` on PATH.
  let resolved = '';
  try {
    resolved = execSync('which claude 2>/dev/null', { encoding: 'utf-8' }).trim();
  } catch {
    // execSync threw (e.g. `which` exited non-zero): treat as not installed.
  }
  return resolved ? resolved : null;
}
|
||||
|
||||
/**
|
||||
* Ask Claude to evaluate whether `actual` output satisfies `criteria`.
|
||||
*
|
||||
* Returns a structured verdict with pass/fail, reasoning, and confidence.
|
||||
* Throws if API call fails (caller should catch and handle).
|
||||
* Determine which backend to use.
|
||||
* Env var LLM_JUDGE_BACKEND overrides auto-detection.
|
||||
* Auto: CLI if available, then API if key set, else null.
|
||||
*/
|
||||
export async function judge(
|
||||
criteria: string,
|
||||
actual: string,
|
||||
context?: string,
|
||||
): Promise<JudgeVerdict> {
|
||||
const apiKey = process.env.ANTHROPIC_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error('ANTHROPIC_API_KEY not set — LLM judge unavailable');
|
||||
}
|
||||
function resolveBackend(): JudgeBackend | null {
|
||||
const explicit = process.env.LLM_JUDGE_BACKEND?.toLowerCase();
|
||||
if (explicit === 'cli') return findClaudeCli() ? 'cli' : null;
|
||||
if (explicit === 'api') return process.env.ANTHROPIC_API_KEY ? 'api' : null;
|
||||
|
||||
const systemPrompt = `You are a test assertion judge for a terminal emulator application called BTerminal.
|
||||
// Auto-detect: CLI first, API fallback
|
||||
if (findClaudeCli()) return 'cli';
|
||||
if (process.env.ANTHROPIC_API_KEY) return 'api';
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Report whether the LLM judge can run at all.
 *
 * @returns `true` when `resolveBackend()` selects a backend, `false` when it
 *          returns `null`.
 */
export function isJudgeAvailable(): boolean {
  const backend = resolveBackend();
  return backend !== null;
}
|
||||
|
||||
/**
|
||||
* Build the prompt for the judge.
|
||||
*/
|
||||
function buildPrompt(criteria: string, actual: string, context?: string): { system: string; user: string } {
|
||||
const system = `You are a test assertion judge for a terminal emulator application called BTerminal.
|
||||
Your job is to evaluate whether actual output from the application meets the given criteria.
|
||||
Respond with EXACTLY this JSON format, nothing else:
|
||||
{"pass": true/false, "reasoning": "brief explanation", "confidence": 0.0-1.0}`;
|
||||
|
||||
const userPrompt = [
|
||||
const user = [
|
||||
'## Criteria',
|
||||
criteria,
|
||||
'',
|
||||
|
|
@ -51,6 +92,71 @@ Respond with EXACTLY this JSON format, nothing else:
|
|||
'Does the actual output satisfy the criteria? Respond with JSON only.',
|
||||
].join('\n');
|
||||
|
||||
return { system, user };
|
||||
}
|
||||
|
||||
/**
 * Extract and validate a JudgeVerdict from raw text output.
 *
 * The model may wrap its answer in prose or markdown fences, so the span from
 * the first `{` to the last `}` is extracted and parsed as JSON.
 *
 * @param text Raw text returned by the judge backend.
 * @returns The parsed verdict with `pass` guaranteed boolean, `reasoning`
 *          coerced to a string, and `confidence` coerced to a number in
 *          [0, 1] (0 when missing or not numeric).
 * @throws Error when no JSON object is found, when the extracted span is not
 *         valid JSON, or when `pass` is not a boolean.
 */
function parseVerdict(text: string): JudgeVerdict {
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) {
    throw new Error(`LLM judge returned non-JSON: ${text}`);
  }

  let parsed: JudgeVerdict;
  try {
    parsed = JSON.parse(jsonMatch[0]) as JudgeVerdict;
  } catch {
    // The greedy match can capture braces that are not a JSON object; report
    // it the same way as "no JSON" instead of leaking a bare SyntaxError.
    throw new Error(`LLM judge returned non-JSON: ${text}`);
  }

  if (typeof parsed.pass !== 'boolean') {
    throw new Error(`LLM judge returned invalid verdict: ${text}`);
  }

  // Missing / non-numeric confidence becomes 0; anything else is clamped to
  // the 0.0-1.0 range the prompt asks for.
  const rawConfidence = Number(parsed.confidence) || 0;
  const confidence = Math.min(1, Math.max(0, rawConfidence));
  const reasoning = String(parsed.reasoning || '');

  return { ...parsed, confidence, reasoning };
}
|
||||
|
||||
/**
 * Judge via Claude CLI (spawns subprocess).
 *
 * Builds a single prompt from the system and user parts, runs the CLI
 * synchronously with `--output-format text`, and parses the verdict from its
 * stdout.
 *
 * CLAUDECODE is removed from the child's environment to avoid nested session
 * errors.
 *
 * @param criteria What the actual output is expected to satisfy.
 * @param actual   The output under evaluation.
 * @param context  Optional extra context forwarded to the prompt builder.
 * @returns The verdict parsed from the CLI output.
 * @throws Error when the CLI binary cannot be found, when the subprocess
 *         fails or exceeds the 60 s timeout, or when the output cannot be
 *         parsed into a verdict.
 */
async function judgeCli(
  criteria: string,
  actual: string,
  context?: string,
): Promise<JudgeVerdict> {
  const cliPath = findClaudeCli();
  if (!cliPath) throw new Error('Claude CLI not found');

  const { system, user } = buildPrompt(criteria, actual, context);
  const fullPrompt = `${system}\n\n${user}`;

  // Actually unset the variable: passing CLAUDECODE: '' would leave it
  // defined (as an empty string) in the child's environment.
  const childEnv = { ...process.env };
  delete childEnv.CLAUDECODE;

  const output = execFileSync(cliPath, [
    '-p', fullPrompt,
    '--model', MODEL,
    '--output-format', 'text',
  ], {
    encoding: 'utf-8',
    timeout: 60_000,
    env: childEnv,
    maxBuffer: 1024 * 1024,
  });

  return parseVerdict(output);
}
|
||||
|
||||
/**
|
||||
* Judge via Anthropic REST API (raw fetch).
|
||||
*/
|
||||
async function judgeApi(
|
||||
criteria: string,
|
||||
actual: string,
|
||||
context?: string,
|
||||
): Promise<JudgeVerdict> {
|
||||
const apiKey = process.env.ANTHROPIC_API_KEY;
|
||||
if (!apiKey) throw new Error('ANTHROPIC_API_KEY not set');
|
||||
|
||||
const { system, user } = buildPrompt(criteria, actual, context);
|
||||
|
||||
const response = await fetch(API_URL, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
|
|
@ -61,8 +167,8 @@ Respond with EXACTLY this JSON format, nothing else:
|
|||
body: JSON.stringify({
|
||||
model: MODEL,
|
||||
max_tokens: MAX_TOKENS,
|
||||
system: systemPrompt,
|
||||
messages: [{ role: 'user', content: userPrompt }],
|
||||
system,
|
||||
messages: [{ role: 'user', content: user }],
|
||||
}),
|
||||
});
|
||||
|
||||
|
|
@ -74,27 +180,37 @@ Respond with EXACTLY this JSON format, nothing else:
|
|||
const data = await response.json();
|
||||
const text = data.content?.[0]?.text ?? '';
|
||||
|
||||
// Extract JSON from response (may have markdown fences)
|
||||
const jsonMatch = text.match(/\{[\s\S]*\}/);
|
||||
if (!jsonMatch) {
|
||||
throw new Error(`LLM judge returned non-JSON: ${text}`);
|
||||
return parseVerdict(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* Ask Claude to evaluate whether `actual` output satisfies `criteria`.
|
||||
*
|
||||
* Uses CLI backend by default, falls back to API. Override with
|
||||
* LLM_JUDGE_BACKEND env var ("cli" or "api").
|
||||
*
|
||||
* Returns a structured verdict with pass/fail, reasoning, and confidence.
|
||||
* Throws if no backend available or call fails.
|
||||
*/
|
||||
export async function judge(
|
||||
criteria: string,
|
||||
actual: string,
|
||||
context?: string,
|
||||
): Promise<JudgeVerdict> {
|
||||
const backend = resolveBackend();
|
||||
if (!backend) {
|
||||
throw new Error('LLM judge unavailable — no Claude CLI found and ANTHROPIC_API_KEY not set');
|
||||
}
|
||||
|
||||
const verdict = JSON.parse(jsonMatch[0]) as JudgeVerdict;
|
||||
|
||||
// Validate structure
|
||||
if (typeof verdict.pass !== 'boolean') {
|
||||
throw new Error(`LLM judge returned invalid verdict: ${text}`);
|
||||
if (backend === 'cli') {
|
||||
return judgeCli(criteria, actual, context);
|
||||
}
|
||||
verdict.confidence = Number(verdict.confidence) || 0;
|
||||
verdict.reasoning = String(verdict.reasoning || '');
|
||||
|
||||
return verdict;
|
||||
return judgeApi(criteria, actual, context);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience: judge with a minimum confidence threshold.
|
||||
* Returns true only if pass=true AND confidence >= threshold.
|
||||
* Returns pass=true only if verdict.pass=true AND confidence >= threshold.
|
||||
*/
|
||||
export async function assertWithJudge(
|
||||
criteria: string,
|
||||
|
|
|
|||
|
|
@ -217,7 +217,7 @@ describe('Scenario B3 — Status Bar Fleet State', () => {
|
|||
// ─── Scenario B4: LLM-judged agent response (requires Claude CLI or API key) ──────
|
||||
|
||||
describe('Scenario B4 — LLM-Judged Agent Response', () => {
|
||||
const SKIP_MSG = 'Skipping — ANTHROPIC_API_KEY not set';
|
||||
const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';
|
||||
|
||||
it('should send prompt and get meaningful response', async function () {
|
||||
if (!isJudgeAvailable()) {
|
||||
|
|
@ -297,7 +297,7 @@ describe('Scenario B4 — LLM-Judged Agent Response', () => {
|
|||
// ─── Scenario B5: LLM-judged code generation quality ─────────────────
|
||||
|
||||
describe('Scenario B5 — LLM-Judged Code Generation', () => {
|
||||
const SKIP_MSG = 'Skipping — ANTHROPIC_API_KEY not set';
|
||||
const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';
|
||||
|
||||
it('should generate valid code when asked', async function () {
|
||||
if (!isJudgeAvailable()) {
|
||||
|
|
|
|||
|
|
@ -559,7 +559,7 @@ describe('Scenario C9 — Files Tab & Code Editor', () => {
|
|||
describe('Scenario C10 — LLM-Judged Settings Completeness', () => {
|
||||
it('should have comprehensive settings panel', async function () {
|
||||
if (!isJudgeAvailable()) {
|
||||
console.log('Skipping — ANTHROPIC_API_KEY not set');
|
||||
console.log('Skipping — LLM judge not available (no CLI or API key)');
|
||||
this.skip();
|
||||
return;
|
||||
}
|
||||
|
|
@ -597,7 +597,7 @@ describe('Scenario C10 — LLM-Judged Settings Completeness', () => {
|
|||
describe('Scenario C11 — LLM-Judged Status Bar Completeness', () => {
|
||||
it('should render a comprehensive status bar', async function () {
|
||||
if (!isJudgeAvailable()) {
|
||||
console.log('Skipping — ANTHROPIC_API_KEY not set');
|
||||
console.log('Skipping — LLM judge not available (no CLI or API key)');
|
||||
this.skip();
|
||||
return;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue