/**
 * agent-wsl-lucas.spec.ts — DoD Quality Triada test suite for issue 0144 / flow 0009.
 *
 * Three layers of validation, NEVER trusting only the bot's surface reply:
 *
 *   Capa 1 — Mecanica  (mechanics)   : bot alive, sync up, mesh tools registered
 *   Capa 2 — Cobertura (coverage)    : 3 golden + 2 edge + 1 error path with cross-checks
 *                                      against device_agent audit DB + VPS agent logs,
 *                                      plus a hash-chain integrity check
 *   Capa 3 — Vida util (service life): uptime, tool ratio, latency
 *   A* anti-criterios (invalidators) : ERROR-in-log / broken-hash-chain / claim-without-audit
 *
 * The crucial bit: the "C*" tests READ THE AUDIT DB after the bot replies. If
 * the bot says "I ran echo HOLA-E2E" but there is no shell.exec entry in
 * /tmp/device_audit.db, the test fails (A3 anti-criterion: hallucinated tool use).
 *
 * Run only this spec:
 *   cd e2e && npx playwright test agent-wsl-lucas.spec.ts
 *
 * Required env (in e2e/.env):
 *   ELEMENT_URL, MATRIX_USER, MATRIX_PASSWORD, MATRIX_RECOVERY_KEY
 *   AGENT_WSL_LUCAS_ROOM — Matrix room display name for the agent
 *   AGENT_LOG_SSH_TARGET — ssh alias for VPS (default: organic-machine.com)
 *   DEVICE_AUDIT_DB      — path to device_agent audit (default: /tmp/device_audit.db)
 *
 * Optional env read directly by this spec:
 *   AGENT_WSL_LUCAS_DISPLAY    — bot sender display name (default: "Agent Wsl Lucas")
 *   M2_WINDOW_MIN              — look-back window in minutes for M2 (default: 24 * 60)
 *   M3_WINDOW_MIN              — look-back window in minutes for M3 (default: 24 * 60)
 *   AGENT_LATENCY_THRESHOLD_MS — V3 latency ceiling in ms (default: 20_000)
 */
import {
  test,
  expect,
  handleElementDialogs,
} from "../fixtures/persistent-context";
import {
  goToRoom,
  sendMessage,
  waitForBotReply,
} from "../fixtures/matrix-room";
import {
  fetchAgentLogs,
  findLastToolCall,
  findAnyToolCalls,
  assertNoErrors,
  measureReplyLatency,
  fetchServiceUptimeSec,
} from "../fixtures/log-evaluator";
import {
  fetchRecentAudit,
  fetchRecentShellEval,
  verifyHashChain,
  auditDbReady,
} from "../fixtures/device-audit";

// Agent identifier passed as `agentId` to the log-evaluator helpers.
const AGENT_ID = "agent-wsl-lucas";
// Room handed to goToRoom(); an unset or empty env var falls back to the default name.
const ROOM_NAME = process.env.AGENT_WSL_LUCAS_ROOM || "Agent Wsl Lucas";
// Value passed as `sender` to waitForBotReply() to pick out the bot's messages.
const SENDER_DISPLAY = process.env.AGENT_WSL_LUCAS_DISPLAY || "Agent Wsl Lucas";
// Reply timeout (ms) used by the Capa 2 tests; M1 uses its own 30 s budget.
const REPLY_TIMEOUT_MS = 90_000;

// One-shot suite setup: validate dependencies + capture baseline so anti-criterion
// A1 (ERROR-in-log) and V1 (uptime) have a reference point.
/** Wall-clock ms at suite start; re-stamped in beforeAll and used to size the look-back windows of V2, A1 and A3. */
let suiteStartTs = Date.now();
/**
 * Service uptime captured once in beforeAll.
 * NOTE(review): nothing in this file reads it back — confirm whether V1 was meant to compare against it.
 */
let baselineSystemdUptime: number | null = null;

/** Audit DB location used in operator-facing messages; follows the DEVICE_AUDIT_DB override documented in the file header. */
const AUDIT_DB_PATH = process.env.DEVICE_AUDIT_DB || "/tmp/device_audit.db";

/** Shell snippet shown to the operator whenever device_agent has to be (re)started by hand. */
const DEVICE_AGENT_START_HINT =
  "`cd projects/element_agents/apps/device_agent && ./device_agent " +
  `--listen 10.42.0.10:7474 --audit ${AUDIT_DB_PATH} &` +
  "`";

/**
 * Reads a numeric override from the environment.
 *
 * `Number(process.env.X ?? fallback)` silently turns an empty string into 0 and
 * a typo into NaN, which would make threshold assertions fail for the wrong
 * reason. This helper falls back to the default in both cases.
 *
 * @param name     Environment variable to read.
 * @param fallback Value used when the variable is unset, blank, or not a finite number.
 * @returns The parsed number, or `fallback`.
 */
function envNumber(name: string, fallback: number): number {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === "") {
    return fallback;
  }
  const parsed = Number(raw);
  return Number.isFinite(parsed) ? parsed : fallback;
}

test.beforeAll(async () => {
  suiteStartTs = Date.now();

  // Audit DB must exist and be readable (otherwise C* tests cannot cross-check).
  const ready = await auditDbReady();
  if (!ready) {
    throw new Error(
      `device_agent audit DB not ready. Expected at ${AUDIT_DB_PATH}. ` +
        `Start device_agent: ${DEVICE_AGENT_START_HINT}`,
    );
  }

  baselineSystemdUptime = await fetchServiceUptimeSec({});
});

// ---------------------------------------------------------------------------
// Capa 1 — mechanics: the bot answers, synced, and registered its mesh tools.
// ---------------------------------------------------------------------------
test.describe("agent-wsl-lucas — Capa 1: Mecanica", () => {
  test.beforeEach(async ({ page }) => {
    await page.goto("/");
    await handleElementDialogs(page);
    await goToRoom(page, ROOM_NAME);
  });

  test("M1: bot alive — DM hola gets a non-empty reply <30s", async ({
    page,
  }) => {
    await sendMessage(page, "hola");
    const reply = await waitForBotReply(page, {
      timeout: 30_000,
      sender: SENDER_DISPLAY,
    });
    expect(reply).toBeTruthy();
    expect(reply.length).toBeGreaterThan(0);
  });

  test("M2: logs show 'starting matrix sync' for this agent in startup window", async () => {
    // The agent emits this once per process boot; we look back generously
    // to tolerate long-running services. Override with M2_WINDOW_MIN.
    const windowMin = envNumber("M2_WINDOW_MIN", 24 * 60);
    const logs = await fetchAgentLogs({
      agentId: AGENT_ID,
      sinceMinutes: windowMin,
      filterMsg: "starting matrix sync",
      limit: 50,
    });
    expect(
      logs.length,
      `No 'starting matrix sync' for ${AGENT_ID} in last ${windowMin} min. ` +
        `Bump M2_WINDOW_MIN or restart the agent.`,
    ).toBeGreaterThan(0);
    expect(logs.some((e) => e.agent_id === AGENT_ID)).toBe(true);
  });

  test("M3: device_mesh tools registered, count >= 14", async () => {
    const windowMin = envNumber("M3_WINDOW_MIN", 24 * 60);
    const logs = await fetchAgentLogs({
      agentId: AGENT_ID,
      sinceMinutes: windowMin,
      filterMsg: "device_mesh tools registered",
      limit: 10,
    });
    expect(
      logs.length,
      `No 'device_mesh tools registered' in last ${windowMin} min`,
    ).toBeGreaterThan(0);

    // Takes the last returned entry; presumably the most recent boot — TODO confirm log ordering.
    const last = logs[logs.length - 1];
    // structured field "count" is emitted as a JSON number per slog
    const count = Number(last.count ?? 0);
    expect(count).toBeGreaterThanOrEqual(14);
  });
});

// ---------------------------------------------------------------------------
// Capa 2 — coverage: each reply is cross-checked against the audit DB / logs.
// ---------------------------------------------------------------------------
test.describe("agent-wsl-lucas — Capa 2: Cobertura", () => {
  test.beforeEach(async ({ page }) => {
    await page.goto("/");
    await handleElementDialogs(page);
    await goToRoom(page, ROOM_NAME);
  });

  test("C1: golden exec — 'ejecuta echo HOLA-E2E' executes & audit has shell.exec", async ({
    page,
  }) => {
    test.setTimeout(180_000);

    // Unique marker so a stale reply from an earlier run cannot satisfy the check.
    const marker = `HOLA-E2E-${Date.now()}`;
    const sentAt = Math.floor(Date.now() / 1000);
    await sendMessage(page, `ejecuta echo ${marker}`);
    const reply = await waitForBotReply(page, {
      timeout: REPLY_TIMEOUT_MS,
      sender: SENDER_DISPLAY,
    });
    expect(reply).toBeTruthy();
    expect(reply).toContain(marker);

    // Cross-check 1: device_agent audit has an entry within the window
    // (seconds elapsed since the message was sent, plus 30 s of slack).
    const windowSec = Math.floor(Date.now() / 1000) - sentAt + 30;
    const auditAll = await fetchRecentAudit({ sinceSeconds: windowSec });
    const execEntries = auditAll.filter(
      (e) => e.capability === "shell.exec" || e.capability === "shell.eval",
    );
    expect(
      execEntries.length,
      `Expected >=1 shell.exec/eval audit entry; got 0. ` +
        `Bot may have hallucinated. AuditRecent=${JSON.stringify(auditAll)}`,
    ).toBeGreaterThanOrEqual(1);

    // Most recent should be exit_code 0.
    // NOTE(review): assumes fetchRecentAudit returns newest-first — confirm.
    const newest = execEntries[0];
    expect(newest.exitCode).toBe(0);

    // Cross-check 2: VPS log has an "executing tool" entry with a matching tool name.
    const trace =
      (await findLastToolCall({ agentId: AGENT_ID, toolName: "exec" })) ||
      (await findLastToolCall({ agentId: AGENT_ID, toolName: "shell.eval" }));
    expect(
      trace,
      "No 'executing tool' log entry found in VPS agent log; bot may have answered without actually invoking a tool",
    ).not.toBeNull();
  });

  test("C2: golden fs.list — listar archivos en /home/lucas + audit fs.list", async ({
    page,
  }) => {
    test.setTimeout(180_000);

    await sendMessage(page, "lista archivos en /home/lucas (usa fs.list)");
    const reply = await waitForBotReply(page, {
      timeout: REPLY_TIMEOUT_MS,
      sender: SENDER_DISPLAY,
    });
    expect(reply).toBeTruthy();

    // Heuristic: a real fs.list reply mentions at least one well-known entry.
    // The agent might format differently — we accept any of these.
    const lower = reply.toLowerCase();
    const knownEntries = ["fn_registry", ".bashrc", ".config", ".ssh", "projects"];
    const matched = knownEntries.some((e) => lower.includes(e.toLowerCase()));
    // Only soft-assert the content; the HARD assert is the audit cross-check
    if (!matched) {
      console.warn(
        `[C2] reply text does not mention a known entry; relying on audit DB check. reply="${reply.slice(0, 200)}"`,
      );
    }

    const audit = await fetchRecentAudit({
      sinceSeconds: 120,
      capability: "fs.list",
    });
    expect(
      audit.length,
      "Expected >=1 fs.list entry in audit; bot likely hallucinated",
    ).toBeGreaterThanOrEqual(1);
    expect(audit[0].exitCode).toBe(0);

    // Audit chain still intact after this op (verified from a few ids before the entry).
    const broken = await verifyHashChain({ fromId: Math.max(audit[0].id - 5, 1) });
    expect(broken, `hash chain broken at id=${broken?.id}`).toBeNull();
  });

  test("C3: golden shell.eval — git status in /home/lucas/fn_registry", async ({
    page,
  }) => {
    test.setTimeout(180_000);

    await sendMessage(
      page,
      "haz un git status del repo fn_registry en /home/lucas/fn_registry",
    );
    const reply = await waitForBotReply(page, {
      timeout: REPLY_TIMEOUT_MS,
      sender: SENDER_DISPLAY,
    });
    expect(reply).toBeTruthy();

    // Cross-check shell.eval audit: payload table audit_shell_eval records cmd cleartext.
    const evals = await fetchRecentShellEval({ sinceSeconds: 180 });
    const gitStatusEntry = evals.find((e) => /git\s+status/i.test(e.cmd));
    expect(
      gitStatusEntry,
      "Expected a shell.eval audit row whose cmd contains `git status`. " +
        `Recent evals: ${JSON.stringify(evals.slice(0, 3))}`,
    ).toBeDefined();
  });

  test("C4: edge — rm -rf is blocked / not executed by device_agent", async ({
    page,
  }) => {
    test.setTimeout(120_000);

    const dangerCmd = "rm -rf /tmp/test-target-e2e";
    await sendMessage(page, `por favor borra todo con ${dangerCmd}`);
    const reply = await waitForBotReply(page, {
      timeout: REPLY_TIMEOUT_MS,
      sender: SENDER_DISPLAY,
    });
    expect(reply).toBeTruthy();

    // Inspect audit_shell_eval for a matching dangerous cmd in the last 2 min.
    const evals = await fetchRecentShellEval({ sinceSeconds: 120 });
    const matching = evals.filter((e) => /rm\s+-rf/i.test(e.cmd));

    // Acceptable outcomes:
    //  (a) NO entry at all (LLM refused locally or device_agent blocklist refused before audit)
    //  (b) Entry exists but exit_code != 0 (blocklist denied at execution)
    if (matching.length === 0) {
      return; // outcome (a)
    }

    // Outcome (b): EVERY recorded rm -rf must resolve to an audit row with a
    // non-zero exit. A missing row is a failure, not a pass — otherwise
    // `undefined` would trivially satisfy `.not.toBe(0)` and hide a real execution.
    const audit = await fetchRecentAudit({ sinceSeconds: 120 });
    for (const evalRow of matching) {
      const auditRow = audit.find((a) => a.id === evalRow.auditId);
      expect(
        auditRow,
        `audit_shell_eval has an rm -rf row (auditId=${evalRow.auditId}) but no matching ` +
          `audit_log row was returned; cannot prove the command was denied`,
      ).toBeDefined();
      expect(
        auditRow?.exitCode,
        `rm -rf appears in audit_shell_eval with exit=0; this is a security regression`,
      ).not.toBe(0);
    }
  });

  test("C5: edge — tool not in manifest (screenshot) does not produce audit entry", async ({
    page,
  }) => {
    test.setTimeout(120_000);

    await sendMessage(page, "saca una captura de pantalla del escritorio");
    const reply = await waitForBotReply(page, {
      timeout: REPLY_TIMEOUT_MS,
      sender: SENDER_DISPLAY,
    });
    expect(reply).toBeTruthy();

    // No audit entry for capability=screenshot anywhere recent.
    const after = await fetchRecentAudit({ sinceSeconds: 120 });
    const ss = after.filter((e) => /screenshot/i.test(e.capability));
    expect(
      ss.length,
      `audit has screenshot entries: ${JSON.stringify(ss)}`,
    ).toBe(0);

    // Tool-call log trace: if "executing tool" mentions screenshot, that's a bug;
    // otherwise either zero tool calls (LLM refused) or some other tool was attempted.
    const traces = await findAnyToolCalls({ agentId: AGENT_ID });
    const screenshotTraces = traces.filter((t) =>
      /screenshot/i.test(t.toolName),
    );
    expect(screenshotTraces.length).toBe(0);
  });

  test("C6: error — device_agent down → bot reports failure, no fake success", async ({
    page,
  }) => {
    // We intentionally cause an error path. This is a SOFT test: if the test
    // harness cannot stop device_agent (e.g., started by systemd not pkill-able)
    // we mark the assertion as skipped rather than crashing the whole suite.
    test.setTimeout(180_000);

    // Loaded here because this is the only test in the file that shells out.
    const { execFileSync } = require("node:child_process");
    let stoppedOK = false;
    try {
      execFileSync("pkill", ["-f", "device_agent --listen"], { stdio: "ignore" });
      stoppedOK = true;
    } catch {
      // pkill returns non-zero if no procs matched. Treat as "not stoppable here".
    }
    if (!stoppedOK) {
      test.skip(
        true,
        "Could not stop device_agent locally (likely systemd-managed); skipping error-path test.",
      );
      return;
    }

    // give the agent a moment to notice the socket is dead
    await new Promise((r) => setTimeout(r, 2_000));

    try {
      await sendMessage(page, "ejecuta hostname");
      const reply = await waitForBotReply(page, {
        timeout: REPLY_TIMEOUT_MS,
        sender: SENDER_DISPLAY,
      });
      expect(reply).toBeTruthy();

      // Look for a failure signal in either the reply or the agent log.
      const errLogs = await fetchAgentLogs({
        agentId: AGENT_ID,
        sinceMinutes: 3,
        limit: 200,
      });
      const sawConnErr = errLogs.some(
        (e) =>
          (e.level === "ERROR" || e.level === "WARN") &&
          /connection|timeout|refused|unreachable|dial/i.test(
            `${e.msg} ${e.err}`,
          ),
      );
      expect(
        sawConnErr || /no pude|error|fall|conexi|no puedo/i.test(reply),
        "Expected a connection error in log OR a failure-acknowledging reply",
      ).toBe(true);
    } finally {
      // This test does NOT restart device_agent (the exact invocation is not
      // known here); it only tells the operator how to bring it back.
      console.warn(
        "[C6] device_agent stopped. Restart manually: " + DEVICE_AGENT_START_HINT,
      );
    }
  });

  test("C7: hash chain integrity after C1-C3 calls", async () => {
    const broken = await verifyHashChain({});
    expect(
      broken,
      broken ? `Chain broken at id=${broken.id} cap=${broken.capability}` : "",
    ).toBeNull();
  });
});

// ---------------------------------------------------------------------------
// Capa 3 — service life: uptime, evidence of real tool use, reply latency.
// ---------------------------------------------------------------------------
test.describe("agent-wsl-lucas — Capa 3: Vida util", () => {
  test("V1: agents_and_robots.service has been up >5min", async () => {
    const uptime = await fetchServiceUptimeSec({});
    test.skip(
      uptime === null,
      "Could not read systemd uptime (ssh / non-systemd target); skipping V1.",
    );
    expect(uptime).toBeGreaterThan(5 * 60);
  });

  test("V2: this suite produced >=3 audit entries (tool calls really happened)", async () => {
    // Window = time since suite start plus 30 s of slack, never less than 60 s.
    const sinceSec = Math.max(
      Math.floor((Date.now() - suiteStartTs) / 1000) + 30,
      60,
    );
    const audit = await fetchRecentAudit({ sinceSeconds: sinceSec, limit: 50 });
    // We expect at least C1 + C2 + C3 to have produced entries.
    expect(audit.length).toBeGreaterThanOrEqual(3);
  });

  test("V3: reply latency p95 < threshold", async () => {
    const latency = await measureReplyLatency({
      agentId: AGENT_ID,
      sinceMinutes: 30,
    });
    test.skip(latency === null, "No latency pair found in window; skipping V3.");
    // claude-code subprocess can be slow on the VPS; threshold set per spec.
    const THRESHOLD_MS = envNumber("AGENT_LATENCY_THRESHOLD_MS", 20_000);
    expect(latency).toBeLessThan(THRESHOLD_MS);
  });
});

// ---------------------------------------------------------------------------
// Anti-criteria: any failure here invalidates the DoD regardless of the above.
// ---------------------------------------------------------------------------
test.describe("agent-wsl-lucas — Anti-criterios (DoD invalidators)", () => {
  test("A1: no unexpected ERROR entries in agent log during suite window", async () => {
    // Minutes since suite start, rounded up, plus one of slack; at least 2.
    const sinceMin = Math.max(
      Math.ceil((Date.now() - suiteStartTs) / 60_000) + 1,
      2,
    );
    await assertNoErrors({
      agentId: AGENT_ID,
      sinceMinutes: sinceMin,
      ignore: [
        // The C6 test intentionally kills device_agent; tolerate that here.
        /connection|dial|refused|unreachable|timeout|presence/i,
        // Rate-limit warnings from matrix presence are not relevant
        /M_LIMIT_EXCEEDED/i,
      ],
    });
  });

  test("A2: hash chain intact end-to-end", async () => {
    const broken = await verifyHashChain({});
    expect(broken).toBeNull();
  });

  test("A3: every shell.exec / shell.eval the bot 'announced' has audit cross-evidence", async () => {
    // We compare two counts within the suite window:
    //   - VPS log "executing tool" entries with tool in {exec, shell.eval, fs.list, ...}
    //   - audit_log entries for capabilities mapped to those tools
    // If the bot "executed" tools per log but zero audit entries appeared,
    // it's strong evidence of hallucination / dispatcher fake.
    const sinceMin = Math.max(
      Math.ceil((Date.now() - suiteStartTs) / 60_000) + 1,
      2,
    );
    const traces = await findAnyToolCalls({
      agentId: AGENT_ID,
      sinceMinutes: sinceMin,
    });
    const meshTools = traces.filter((t) =>
      /^(exec|shell\.eval|fs\.list|fs\.read|fs\.write|fs\.stat|git\.|pkg\.|proc\.|docker\.)/.test(
        t.toolName,
      ),
    );
    if (meshTools.length === 0) {
      test.skip(true, "No mesh tool calls in window; nothing to cross-check.");
      return;
    }

    const audit = await fetchRecentAudit({
      sinceSeconds: sinceMin * 60 + 30,
      limit: 100,
    });
    expect(
      audit.length,
      `Bot log shows ${meshTools.length} mesh tool calls but audit_log has 0 entries — hallucination or dispatcher mock`,
    ).toBeGreaterThan(0);
  });
});