feat: refactor LLM judge to dual-mode CLI/API and fix config test race

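Refactor llm-judge.ts from raw API-only to dual-mode: the Claude CLI is tried first (spawned with --output-format text and with CLAUDECODE cleared in the child environment to avoid nested-session errors), with the Anthropic REST API as fallback. The backend can be forced via the LLM_JUDGE_BACKEND env var ("cli" or "api"). Also fix a pre-existing race condition in the config.rs tests, where parallel test execution let process-global env var mutations interfere with one another, by adding a static Mutex that serializes the env-mutating tests.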
2026-03-12 06:35:04 +01:00 · 2026-03-12 06:35:04 +01:00 · a3185656eb
commit a3185656eb
parent 05c9e1abbb
4 changed files with 169 additions and 42 deletions
--- a/v2/bterminal-core/src/config.rs
+++ b/v2/bterminal-core/src/config.rs
@ -109,9 +109,15 @@ impl AppConfig {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use std::sync::Mutex;
+
+    // Serialize all tests that mutate env vars to prevent race conditions.
+    // Rust runs tests in parallel; set_var/remove_var are process-global.
+    // Each env-mutating test takes this lock first and keeps the guard alive
+    // in a `_lock` binding. Note: a test that panics while holding the guard
+    // poisons the mutex, so later `lock().unwrap()` calls will panic as well.
+    static ENV_LOCK: Mutex<()> = Mutex::new(());

    #[test]
    fn test_production_paths_use_dirs() {
+        let _lock = ENV_LOCK.lock().unwrap();
        // Without BTERMINAL_TEST=1, paths should use dirs:: defaults
        std::env::remove_var("BTERMINAL_TEST");
        std::env::remove_var("BTERMINAL_TEST_DATA_DIR");
@ -129,6 +135,7 @@ mod tests {

    #[test]
    fn test_btmsg_db_path() {
+        let _lock = ENV_LOCK.lock().unwrap();
        std::env::remove_var("BTERMINAL_TEST");
        let config = AppConfig::from_env();
        let path = config.btmsg_db_path();
@ -138,6 +145,7 @@ mod tests {

    #[test]
    fn test_groups_json_path() {
+        let _lock = ENV_LOCK.lock().unwrap();
        std::env::remove_var("BTERMINAL_TEST");
        let config = AppConfig::from_env();
        let path = config.groups_json_path();
@ -146,6 +154,7 @@ mod tests {

    #[test]
    fn test_test_mode_uses_overrides() {
+        let _lock = ENV_LOCK.lock().unwrap();
        std::env::set_var("BTERMINAL_TEST", "1");
        std::env::set_var("BTERMINAL_TEST_DATA_DIR", "/tmp/bt-test-data");
        std::env::set_var("BTERMINAL_TEST_CONFIG_DIR", "/tmp/bt-test-config");
@ -168,6 +177,7 @@ mod tests {

    #[test]
    fn test_test_mode_without_overrides_uses_defaults() {
+        let _lock = ENV_LOCK.lock().unwrap();
        std::env::set_var("BTERMINAL_TEST", "1");
        std::env::remove_var("BTERMINAL_TEST_DATA_DIR");
        std::env::remove_var("BTERMINAL_TEST_CONFIG_DIR");
@ -183,6 +193,7 @@ mod tests {

    #[test]
    fn test_test_mode_memora_in_data_dir() {
+        let _lock = ENV_LOCK.lock().unwrap();
        std::env::set_var("BTERMINAL_TEST", "1");
        std::env::set_var("BTERMINAL_TEST_DATA_DIR", "/tmp/bt-test-data");

--- a/v2/tests/e2e/llm-judge.ts
+++ b/v2/tests/e2e/llm-judge.ts
@ -1,11 +1,31 @@
-// LLM Judge — evaluates test outcomes via Claude API
-// Uses raw fetch (no SDK dep). Requires ANTHROPIC_API_KEY env var.
-// Skips gracefully when API key is absent.
+// LLM Judge — evaluates test outcomes via Claude.
+//
+// Two backends, configurable via LLM_JUDGE_BACKEND env var:
+//   "cli"  — Claude CLI (default, no API key needed)
+//   "api"  — Anthropic REST API (requires ANTHROPIC_API_KEY)
+//
+// CLI backend: spawns `claude` with --output-format text, parses JSON verdict.
+// API backend: raw fetch to messages API, same JSON verdict parsing.
+//
+// Skips gracefully when neither backend is available.

+import { execFileSync, execSync } from 'node:child_process';
+import { existsSync } from 'node:fs';
+
+const MODEL = 'claude-haiku-4-5-20251001';
 const API_URL = 'https://api.anthropic.com/v1/messages';
-const MODEL = 'claude-haiku-4-5-20251001'; // Fast + cheap for test judging
 const MAX_TOKENS = 512;

+// CLI search paths (in order)
+const CLI_PATHS = [
+  `${process.env.HOME}/.local/bin/claude`,
+  `${process.env.HOME}/.claude/local/claude`,
+  '/usr/local/bin/claude',
+  '/usr/bin/claude',
+];
+
+export type JudgeBackend = 'cli' | 'api';
+
 export interface JudgeVerdict {
  pass: boolean;
  reasoning: string;
@ -13,34 +33,55 @@ export interface JudgeVerdict {
 }

 /**
- * Check if the LLM judge is available (API key set).
+ * Find the Claude CLI binary path, or null if not installed.
 */
-export function isJudgeAvailable(): boolean {
-  return !!process.env.ANTHROPIC_API_KEY;
+function findClaudeCli(): string | null {
+  // Well-known install locations are probed in order; the first hit wins.
+  const knownLocation = CLI_PATHS.find((candidate) => existsSync(candidate));
+  if (knownLocation !== undefined) return knownLocation;
+
+  // Fallback: check PATH. execSync throws when `which` exits non-zero, in
+  // which case the resolved path simply stays empty.
+  let resolved = '';
+  try {
+    resolved = execSync('which claude 2>/dev/null', { encoding: 'utf-8' }).trim();
+  } catch {
+    // not found
+  }
+  return resolved.length > 0 ? resolved : null;
 }

 /**
- * Ask Claude to evaluate whether `actual` output satisfies `criteria`.
- *
- * Returns a structured verdict with pass/fail, reasoning, and confidence.
- * Throws if API call fails (caller should catch and handle).
+ * Determine which backend to use.
+ * Env var LLM_JUDGE_BACKEND overrides auto-detection.
+ * Auto: CLI if available, then API if key set, else null.
 */
-export async function judge(
-  criteria: string,
-  actual: string,
-  context?: string,
-): Promise<JudgeVerdict> {
-  const apiKey = process.env.ANTHROPIC_API_KEY;
-  if (!apiKey) {
-    throw new Error('ANTHROPIC_API_KEY not set — LLM judge unavailable');
-  }
+function resolveBackend(): JudgeBackend | null {
+  const explicit = process.env.LLM_JUDGE_BACKEND?.toLowerCase();
+  // An explicit choice is never substituted: if the requested backend is
+  // unusable the result is null rather than the other backend.
+  if (explicit === 'cli') return findClaudeCli() ? 'cli' : null;
+  if (explicit === 'api') return process.env.ANTHROPIC_API_KEY ? 'api' : null;

-  const systemPrompt = `You are a test assertion judge for a terminal emulator application called BTerminal.
+  // Any other value (including an unset variable) falls through to here.
+  // Auto-detect: CLI first, API fallback
+  if (findClaudeCli()) return 'cli';
+  if (process.env.ANTHROPIC_API_KEY) return 'api';
+  return null;
+}
+
+/**
+ * Report whether the LLM judge can be used right now, i.e. whether backend
+ * resolution yields a backend (CLI or API) rather than null.
+ */
+export function isJudgeAvailable(): boolean {
+  const backend = resolveBackend();
+  return backend !== null;
+}
+
+/**
+ * Build the prompt for the judge.
+ */
+function buildPrompt(criteria: string, actual: string, context?: string): { system: string; user: string } {
+  const system = `You are a test assertion judge for a terminal emulator application called BTerminal.
 Your job is to evaluate whether actual output from the application meets the given criteria.
 Respond with EXACTLY this JSON format, nothing else:
 {"pass": true/false, "reasoning": "brief explanation", "confidence": 0.0-1.0}`;

-  const userPrompt = [
+  const user = [
    '## Criteria',
    criteria,
    '',
@ -51,6 +92,71 @@ Respond with EXACTLY this JSON format, nothing else:
    'Does the actual output satisfy the criteria? Respond with JSON only.',
  ].join('\n');

+  return { system, user };
+}
+
+/**
+ * Extract and validate a JudgeVerdict from raw text output.
+ *
+ * The span from the first `{` to the last `}` in `text` is treated as the
+ * JSON payload, so surrounding prose or markdown fences are tolerated.
+ *
+ * @param text Raw text produced by a judge backend.
+ * @returns The parsed verdict, with `confidence` coerced to a number (0 when
+ *   missing or not numeric) and `reasoning` coerced to a string.
+ * @throws Error if `text` contains no JSON object, the JSON is malformed, or
+ *   `pass` is not a boolean. The raw text is included in every message.
+ */
+function parseVerdict(text: string): JudgeVerdict {
+  const jsonMatch = text.match(/\{[\s\S]*\}/);
+  if (!jsonMatch) {
+    throw new Error(`LLM judge returned non-JSON: ${text}`);
+  }
+
+  let verdict: JudgeVerdict;
+  try {
+    verdict = JSON.parse(jsonMatch[0]) as JudgeVerdict;
+  } catch {
+    // Report the offending output instead of a bare SyntaxError, matching
+    // the other failure paths of this function.
+    throw new Error(`LLM judge returned malformed JSON: ${text}`);
+  }
+
+  if (typeof verdict.pass !== 'boolean') {
+    throw new Error(`LLM judge returned invalid verdict: ${text}`);
+  }
+  verdict.confidence = Number(verdict.confidence) || 0;
+  verdict.reasoning = String(verdict.reasoning || '');
+
+  return verdict;
+}
+
+/**
+ * Judge via Claude CLI (spawns subprocess).
+ *
+ * The child process is started without CLAUDECODE in its environment to
+ * avoid nested session errors. The call is synchronous: it blocks until the
+ * CLI exits or the 60 second timeout elapses.
+ *
+ * @param criteria What the actual output is expected to satisfy.
+ * @param actual The output under evaluation.
+ * @param context Optional extra context forwarded to the prompt builder.
+ * @returns The verdict parsed from the CLI's text output.
+ * @throws Error if the CLI binary cannot be found, the subprocess fails or
+ *   times out, or its output does not contain a valid verdict.
+ */
+async function judgeCli(
+  criteria: string,
+  actual: string,
+  context?: string,
+): Promise<JudgeVerdict> {
+  const cliPath = findClaudeCli();
+  if (!cliPath) throw new Error('Claude CLI not found');
+
+  const { system, user } = buildPrompt(criteria, actual, context);
+  const fullPrompt = `${system}\n\n${user}`;
+
+  // Really remove the variable: assigning '' would leave it defined (as an
+  // empty string) in the child's environment.
+  const childEnv = { ...process.env };
+  delete childEnv.CLAUDECODE;
+
+  const output = execFileSync(cliPath, [
+    '-p', fullPrompt,
+    '--model', MODEL,
+    '--output-format', 'text',
+  ], {
+    encoding: 'utf-8',
+    timeout: 60_000,
+    env: childEnv,
+    maxBuffer: 1024 * 1024,
+  });
+
+  return parseVerdict(output);
+}
+
+/**
+ * Judge via Anthropic REST API (raw fetch).
+ */
+async function judgeApi(
+  criteria: string,
+  actual: string,
+  context?: string,
+): Promise<JudgeVerdict> {
+  const apiKey = process.env.ANTHROPIC_API_KEY;
+  if (!apiKey) throw new Error('ANTHROPIC_API_KEY not set');
+
+  const { system, user } = buildPrompt(criteria, actual, context);
+
  const response = await fetch(API_URL, {
    method: 'POST',
    headers: {
@ -61,8 +167,8 @@ Respond with EXACTLY this JSON format, nothing else:
    body: JSON.stringify({
      model: MODEL,
      max_tokens: MAX_TOKENS,
-      system: systemPrompt,
-      messages: [{ role: 'user', content: userPrompt }],
+      system,
+      messages: [{ role: 'user', content: user }],
    }),
  });

@ -74,27 +180,37 @@ Respond with EXACTLY this JSON format, nothing else:
  const data = await response.json();
  const text = data.content?.[0]?.text ?? '';

-  // Extract JSON from response (may have markdown fences)
-  const jsonMatch = text.match(/\{[\s\S]*\}/);
-  if (!jsonMatch) {
-    throw new Error(`LLM judge returned non-JSON: ${text}`);
+  return parseVerdict(text);
+}
+
+/**
+ * Ask Claude to evaluate whether `actual` output satisfies `criteria`.
+ *
+ * Uses CLI backend by default, falls back to API. Override with
+ * LLM_JUDGE_BACKEND env var ("cli" or "api").
+ *
+ * Returns a structured verdict with pass/fail, reasoning, and confidence.
+ * Throws if no backend available or call fails.
+ */
+export async function judge(
+  criteria: string,
+  actual: string,
+  context?: string,
+): Promise<JudgeVerdict> {
+  // The backend is resolved on every call, so it reflects the environment
+  // at call time.
+  const backend = resolveBackend();
+  if (!backend) {
+    // NOTE(review): when LLM_JUDGE_BACKEND forces one backend, this message
+    // can be inaccurate (e.g. "api" forced without a key while the CLI is
+    // installed) — confirm whether a more specific message is wanted.
+    throw new Error('LLM judge unavailable — no Claude CLI found and ANTHROPIC_API_KEY not set');
   }

-  const verdict = JSON.parse(jsonMatch[0]) as JudgeVerdict;
-
-  // Validate structure
-  if (typeof verdict.pass !== 'boolean') {
-    throw new Error(`LLM judge returned invalid verdict: ${text}`);
+  if (backend === 'cli') {
+    return judgeCli(criteria, actual, context);
   }
-  verdict.confidence = Number(verdict.confidence) || 0;
-  verdict.reasoning = String(verdict.reasoning || '');
-
-  return verdict;
+  return judgeApi(criteria, actual, context);
 }

 /**
 * Convenience: judge with a minimum confidence threshold.
- * Returns true only if pass=true AND confidence >= threshold.
+ * Returns pass=true only if verdict.pass=true AND confidence >= threshold.
 */
 export async function assertWithJudge(
  criteria: string,
--- a/v2/tests/e2e/specs/phase-b.test.ts
+++ b/v2/tests/e2e/specs/phase-b.test.ts
@ -217,7 +217,7 @@ describe('Scenario B3 — Status Bar Fleet State', () => {
 // ─── Scenario B4: LLM-judged agent response (requires API key) ──────

 describe('Scenario B4 — LLM-Judged Agent Response', () => {
-  const SKIP_MSG = 'Skipping — ANTHROPIC_API_KEY not set';
+  const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';

  it('should send prompt and get meaningful response', async function () {
    if (!isJudgeAvailable()) {
@ -297,7 +297,7 @@ describe('Scenario B4 — LLM-Judged Agent Response', () => {
 // ─── Scenario B5: LLM-judged code generation quality ─────────────────

 describe('Scenario B5 — LLM-Judged Code Generation', () => {
-  const SKIP_MSG = 'Skipping — ANTHROPIC_API_KEY not set';
+  const SKIP_MSG = 'Skipping — LLM judge not available (no CLI or API key)';

  it('should generate valid code when asked', async function () {
    if (!isJudgeAvailable()) {
--- a/v2/tests/e2e/specs/phase-c.test.ts
+++ b/v2/tests/e2e/specs/phase-c.test.ts
@ -559,7 +559,7 @@ describe('Scenario C9 — Files Tab & Code Editor', () => {
 describe('Scenario C10 — LLM-Judged Settings Completeness', () => {
  it('should have comprehensive settings panel', async function () {
    if (!isJudgeAvailable()) {
-      console.log('Skipping — ANTHROPIC_API_KEY not set');
+      console.log('Skipping — LLM judge not available (no CLI or API key)');
      this.skip();
      return;
    }
@ -597,7 +597,7 @@ describe('Scenario C10 — LLM-Judged Settings Completeness', () => {
 describe('Scenario C11 — LLM-Judged Status Bar Completeness', () => {
  it('should render a comprehensive status bar', async function () {
    if (!isJudgeAvailable()) {
-      console.log('Skipping — ANTHROPIC_API_KEY not set');
+      console.log('Skipping — LLM judge not available (no CLI or API key)');
      this.skip();
      return;
    }