/** * log-evaluator.ts — SSH to VPS + tail/grep agent JSONL logs. * * The agent-wsl-lucas runs in `agents_and_robots.service` on organic-machine.com. * Per-agent logs live in /home/ubuntu/CodeProyects/agents_and_robots/logs//YYYY-MM-DD.jsonl * (slog JSON handler — one JSON object per line). * * This fixture is used by DoD Capa 2 e2e tests to *cross-check* what the bot * said in Matrix against what the runtime actually did. A bot can hallucinate * output and never invoke a tool; reading logs catches that. */ import { execFileSync } from "node:child_process"; export interface LogEntry { time: string; level: string; msg: string; agent_id?: string; tool?: string; call_id?: string; request_id?: string; err?: string; // arbitrary structured fields [k: string]: unknown; } export interface ToolCallTrace { toolName: string; callId: string; ts: string; raw: LogEntry; } export interface FetchLogsOptions { agentId: string; sshTarget?: string; sinceMinutes?: number; filterMsg?: string; limit?: number; // Override (testing): read from a local file instead of SSH. localFile?: string; } const DEFAULT_SSH_TARGET = process.env.AGENT_LOG_SSH_TARGET ?? "organic-machine.com"; const DEFAULT_LOG_BASE = process.env.AGENT_LOG_BASE_DIR ?? "/home/ubuntu/CodeProyects/agents_and_robots/logs"; function isoToday(): string { // Logs are in UTC; the slog handler uses time.Now() which the launcher serializes as RFC3339. // File names use YYYY-MM-DD in UTC. const d = new Date(); const y = d.getUTCFullYear(); const m = String(d.getUTCMonth() + 1).padStart(2, "0"); const day = String(d.getUTCDate()).padStart(2, "0"); return `${y}-${m}-${day}`; } function isoYesterday(): string { const d = new Date(Date.now() - 24 * 60 * 60 * 1000); const y = d.getUTCFullYear(); const m = String(d.getUTCMonth() + 1).padStart(2, "0"); const day = String(d.getUTCDate()).padStart(2, "0"); return `${y}-${m}-${day}`; } /** * Run a command on the VPS via ssh. Throws if exit != 0. * Uses execFileSync to avoid shell-injection on the local side. */ function sshExec(sshTarget: string, remoteCmd: string): string { try { const out = execFileSync( "ssh", [ "-o", "BatchMode=yes", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=accept-new", sshTarget, remoteCmd, ], { encoding: "utf8", maxBuffer: 8 * 1024 * 1024 }, ); return out; } catch (err: any) { const stderr = err?.stderr?.toString?.() ?? ""; const stdout = err?.stdout?.toString?.() ?? ""; throw new Error( `ssh ${sshTarget} failed: ${err.message}\nstderr=${stderr}\nstdout=${stdout}`, ); } } /** Read N last entries from the agent log, optionally grep-filtered. */ export async function fetchAgentLogs(opts: FetchLogsOptions): Promise { const sinceMinutes = opts.sinceMinutes ?? 5; const limit = opts.limit ?? 200; const target = opts.sshTarget ?? DEFAULT_SSH_TARGET; // We pull TODAY's log file (UTC). If the test crosses midnight, also grab yesterday. // tail+grep is good enough; we will JSON-parse and filter by time client-side. const today = isoToday(); const yesterday = isoYesterday(); const baseDir = DEFAULT_LOG_BASE; const agentDir = `${baseDir}/${opts.agentId}`; // Read both files (best-effort) and let the time filter cut. // Limit per-file tail to keep ssh response bounded. const perFileTail = Math.max(limit * 5, 1000); let raw: string; if (opts.localFile) { // Local override path for self-test / dev const fs = require("node:fs"); raw = fs.readFileSync(opts.localFile, "utf8"); } else { const cmd = // `2>/dev/null || true` so missing files don't make ssh exit non-zero `(tail -n ${perFileTail} ${agentDir}/${yesterday}.jsonl 2>/dev/null || true; ` + `tail -n ${perFileTail} ${agentDir}/${today}.jsonl 2>/dev/null || true)`; raw = sshExec(target, cmd); } const sinceMs = Date.now() - sinceMinutes * 60 * 1000; const entries: LogEntry[] = []; for (const line of raw.split("\n")) { const trimmed = line.trim(); if (!trimmed) continue; let obj: LogEntry; try { obj = JSON.parse(trimmed); } catch { continue; } // Time filter const t = obj.time ? Date.parse(obj.time) : NaN; if (!Number.isFinite(t) || t < sinceMs) continue; if (opts.filterMsg && !(obj.msg ?? "").includes(opts.filterMsg)) continue; entries.push(obj); } // Keep last `limit` return entries.slice(-limit); } /** * Find the most recent log entry for an executing-tool call where tool matches. * * The launcher emits: logger.Info("executing tool", "tool", tc.Name, "call_id", tc.ID) * in devagents/llm.go (line 125). We grep that as the canonical tool-call trace. */ export async function findLastToolCall(opts: { agentId: string; toolName: string; sinceMinutes?: number; sshTarget?: string; }): Promise { const logs = await fetchAgentLogs({ agentId: opts.agentId, sinceMinutes: opts.sinceMinutes ?? 5, sshTarget: opts.sshTarget, filterMsg: "executing tool", limit: 500, }); for (let i = logs.length - 1; i >= 0; i--) { const e = logs[i]; if (e.msg === "executing tool" && e.tool === opts.toolName) { return { toolName: opts.toolName, callId: String(e.call_id ?? ""), ts: e.time, raw: e, }; } } return null; } /** Find ANY executing-tool call regardless of tool name. */ export async function findAnyToolCalls(opts: { agentId: string; sinceMinutes?: number; sshTarget?: string; }): Promise { const logs = await fetchAgentLogs({ agentId: opts.agentId, sinceMinutes: opts.sinceMinutes ?? 5, sshTarget: opts.sshTarget, filterMsg: "executing tool", limit: 500, }); return logs .filter((e) => e.msg === "executing tool" && typeof e.tool === "string") .map((e) => ({ toolName: String(e.tool), callId: String(e.call_id ?? ""), ts: e.time, raw: e, })); } /** Throws if any ERROR-level entry exists in the window (allowlist optional). */ export async function assertNoErrors(opts: { agentId: string; sinceMinutes?: number; sshTarget?: string; // Substrings on `msg` or `err` that are acceptable to ignore ignore?: RegExp[]; }): Promise { const logs = await fetchAgentLogs({ agentId: opts.agentId, sinceMinutes: opts.sinceMinutes ?? 5, sshTarget: opts.sshTarget, limit: 1000, }); const errors = logs.filter((e) => e.level === "ERROR"); const unexpected = errors.filter((e) => { if (!opts.ignore || opts.ignore.length === 0) return true; const blob = `${e.msg ?? ""} ${e.err ?? ""}`; return !opts.ignore.some((rx) => rx.test(blob)); }); if (unexpected.length > 0) { const sample = unexpected .slice(0, 5) .map((e) => `[${e.time}] ${e.msg} err=${e.err}`) .join("\n"); throw new Error( `Agent log has ${unexpected.length} ERROR entries in last ` + `${opts.sinceMinutes ?? 5}min:\n${sample}`, ); } } /** * Best-effort latency measurement. * The launcher does NOT emit a single correlated "reply_sent" with the same id; * we approximate by measuring distance between `message_received` and the * next `tool_use loop complete` / final response log in the same agent. * If no pair found, returns null. */ export async function measureReplyLatency(opts: { agentId: string; sinceMinutes?: number; sshTarget?: string; }): Promise { const logs = await fetchAgentLogs({ agentId: opts.agentId, sinceMinutes: opts.sinceMinutes ?? 10, sshTarget: opts.sshTarget, limit: 2000, }); // We look for pairs: "message_received" → next "llm completion" or "executing tool" // ending with "reply sent" / "tool_use loop done". Heuristic: pair each // message_received with the next log at level INFO emitted within 60s. let last: number | null = null; for (let i = 0; i < logs.length - 1; i++) { const a = logs[i]; if (a.msg !== "message_received") continue; const aT = Date.parse(a.time); for (let j = i + 1; j < logs.length; j++) { const b = logs[j]; const bT = Date.parse(b.time); if (bT - aT > 60_000) break; if ( b.msg === "executing tool" || b.msg === "llm response" || b.msg === "tool_use_loop_done" || (typeof b.msg === "string" && b.msg.includes("reply")) ) { last = bT - aT; break; } } } return last; } /** * Service uptime via systemd (best-effort). Returns seconds since * ActiveEnterTimestamp, or null if unable to read. */ export async function fetchServiceUptimeSec(opts: { sshTarget?: string; unit?: string; }): Promise { const target = opts.sshTarget ?? DEFAULT_SSH_TARGET; const unit = opts.unit ?? "agents_and_robots.service"; try { const out = sshExec( target, `systemctl show ${unit} --property=ActiveEnterTimestamp --value 2>/dev/null || true`, ); const stamp = out.trim(); if (!stamp) return null; const t = Date.parse(stamp); if (!Number.isFinite(t)) return null; return Math.floor((Date.now() - t) / 1000); } catch { return null; } }