chore: auto-commit (27 archivos)

- .claude/CLAUDE.md - .claude/rules/create_agent.md - agents/_specials/father-bot/prompts/system.md - agents/_template/config.yaml - agents/_template_robot/config.yaml - cmd/agentctl/autoavatar.go - cmd/launcher/sqlite.go - dev-scripts/_common.sh - dev-scripts/agent/create-full.sh - dev-scripts/agent/delete-full.sh - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 19:38:16 +02:00
parent 072e00f305
commit fc86edd94c
27 changed files with 2199 additions and 111 deletions
@@ -0,0 +1,461 @@
+/**
+ * agent-wsl-lucas.spec.ts — DoD Quality Triada test suite for issue 0144 / flow 0009.
+ *
+ * Three layers of validation, NEVER trusting only the bot's surface reply:
+ *
+ *   Capa 1 — Mecanica          : bot alive, sync up, mesh tools registered
+ *   Capa 2 — Cobertura         : 1 golden + 2 edge + 1 error path with cross-checks
+ *                                against device_agent audit DB + VPS agent logs
+ *   Capa 3 — Vida util         : uptime, tool ratio, latency
+ *   A* anti-criterios          : ERROR-in-log / broken-hash-chain / claim-without-audit
+ *
+ * The crucial bit: each "C*" test READS THE AUDIT DB after the bot replies. If
+ * the bot says "I ran echo HOLA-E2E" but there is no shell.exec entry in
+ * /tmp/device_audit.db, the test fails (A3 anti-criterion: hallucinated tool use).
+ *
+ * Run only this spec:
+ *   cd e2e && npx playwright test agent-wsl-lucas.spec.ts
+ *
+ * Required env (in e2e/.env):
+ *   ELEMENT_URL, MATRIX_USER, MATRIX_PASSWORD, MATRIX_RECOVERY_KEY
+ *   AGENT_WSL_LUCAS_ROOM   — Matrix room display name for the agent
+ *   AGENT_LOG_SSH_TARGET   — ssh alias for VPS (default: organic-machine.com)
+ *   DEVICE_AUDIT_DB        — path to device_agent audit (default: /tmp/device_audit.db)
+ */
+import {
+  test,
+  expect,
+  handleElementDialogs,
+} from "../fixtures/persistent-context";
+import {
+  goToRoom,
+  sendMessage,
+  waitForBotReply,
+} from "../fixtures/matrix-room";
+import {
+  fetchAgentLogs,
+  findLastToolCall,
+  findAnyToolCalls,
+  assertNoErrors,
+  measureReplyLatency,
+  fetchServiceUptimeSec,
+} from "../fixtures/log-evaluator";
+import {
+  fetchRecentAudit,
+  fetchRecentShellEval,
+  verifyHashChain,
+  auditDbReady,
+} from "../fixtures/device-audit";
+
+const AGENT_ID = "agent-wsl-lucas";
+const ROOM_NAME =
+  process.env.AGENT_WSL_LUCAS_ROOM || "Agent Wsl Lucas";
+const SENDER_DISPLAY =
+  process.env.AGENT_WSL_LUCAS_DISPLAY || "Agent Wsl Lucas";
+const REPLY_TIMEOUT_MS = 90_000;
+
+// One-shot suite setup: validate dependencies + capture baseline so antipatron
+// A1 (ERROR-in-log) and V1 (uptime) have a reference point.
+let suiteStartTs = Date.now();
+let baselineSystemdUptime: number | null = null;
+
+test.beforeAll(async () => {
+  suiteStartTs = Date.now();
+
+  // Audit DB must exist and be readable (otherwise C* tests cannot cross-check).
+  const ready = await auditDbReady();
+  if (!ready) {
+    throw new Error(
+      "device_agent audit DB not ready. Expected at /tmp/device_audit.db. " +
+        "Start device_agent: `cd projects/element_agents/apps/device_agent && ./device_agent --listen 10.42.0.10:7474 --audit /tmp/device_audit.db &`",
+    );
+  }
+  baselineSystemdUptime = await fetchServiceUptimeSec({});
+});
+
+test.describe("agent-wsl-lucas — Capa 1: Mecanica", () => {
+  test.beforeEach(async ({ page }) => {
+    await page.goto("/");
+    await handleElementDialogs(page);
+    await goToRoom(page, ROOM_NAME);
+  });
+
+  test("M1: bot alive — DM hola gets a non-empty reply <30s", async ({
+    page,
+  }) => {
+    await sendMessage(page, "hola");
+    const reply = await waitForBotReply(page, {
+      timeout: 30_000,
+      sender: SENDER_DISPLAY,
+    });
+    expect(reply).toBeTruthy();
+    expect(reply.length).toBeGreaterThan(0);
+  });
+
+  test("M2: logs show 'starting matrix sync' for this agent in startup window", async () => {
+    // The agent emits this once per process boot; we look back generously
+    // to tolerate long-running services. Override with M2_WINDOW_MIN.
+    const windowMin = Number(process.env.M2_WINDOW_MIN ?? 24 * 60);
+    const logs = await fetchAgentLogs({
+      agentId: AGENT_ID,
+      sinceMinutes: windowMin,
+      filterMsg: "starting matrix sync",
+      limit: 50,
+    });
+    expect(
+      logs.length,
+      `No 'starting matrix sync' for ${AGENT_ID} in last ${windowMin} min. ` +
+        `Bump M2_WINDOW_MIN or restart the agent.`,
+    ).toBeGreaterThan(0);
+    expect(logs.some((e) => e.agent_id === AGENT_ID)).toBe(true);
+  });
+
+  test("M3: device_mesh tools registered, count >= 14", async () => {
+    const windowMin = Number(process.env.M3_WINDOW_MIN ?? 24 * 60);
+    const logs = await fetchAgentLogs({
+      agentId: AGENT_ID,
+      sinceMinutes: windowMin,
+      filterMsg: "device_mesh tools registered",
+      limit: 10,
+    });
+    expect(
+      logs.length,
+      `No 'device_mesh tools registered' in last ${windowMin} min`,
+    ).toBeGreaterThan(0);
+    const last = logs[logs.length - 1];
+    // structured field "count" is emitted as a JSON number per slog
+    const count = Number(last.count ?? 0);
+    expect(count).toBeGreaterThanOrEqual(14);
+  });
+});
+
+test.describe("agent-wsl-lucas — Capa 2: Cobertura", () => {
+  test.beforeEach(async ({ page }) => {
+    await page.goto("/");
+    await handleElementDialogs(page);
+    await goToRoom(page, ROOM_NAME);
+  });
+
+  test("C1: golden exec — 'ejecuta echo HOLA-E2E' executes & audit has shell.exec", async ({
+    page,
+  }) => {
+    test.setTimeout(180_000);
+    const marker = `HOLA-E2E-${Date.now()}`;
+    const sentAt = Math.floor(Date.now() / 1000);
+
+    await sendMessage(page, `ejecuta echo ${marker}`);
+    const reply = await waitForBotReply(page, {
+      timeout: REPLY_TIMEOUT_MS,
+      sender: SENDER_DISPLAY,
+    });
+    expect(reply).toBeTruthy();
+    expect(reply).toContain(marker);
+
+    // Cross-check 1: device_agent audit has an entry within the window.
+    const window = Math.floor(Date.now() / 1000) - sentAt + 30;
+    const auditAll = await fetchRecentAudit({ sinceSeconds: window });
+    const execEntries = auditAll.filter(
+      (e) => e.capability === "shell.exec" || e.capability === "shell.eval",
+    );
+    expect(
+      execEntries.length,
+      `Expected >=1 shell.exec/eval audit entry; got 0. ` +
+        `Bot may have hallucinated. AuditRecent=${JSON.stringify(auditAll)}`,
+    ).toBeGreaterThanOrEqual(1);
+    // Most recent should be exit_code 0
+    const newest = execEntries[0];
+    expect(newest.exitCode).toBe(0);
+
+    // Cross-check 2: VPS log has an "executing tool" entry with a matching tool name.
+    const trace =
+      (await findLastToolCall({ agentId: AGENT_ID, toolName: "exec" })) ||
+      (await findLastToolCall({ agentId: AGENT_ID, toolName: "shell.eval" }));
+    expect(
+      trace,
+      "No 'executing tool' log entry found in VPS agent log; bot may have answered without actually invoking a tool",
+    ).not.toBeNull();
+  });
+
+  test("C2: golden fs.list — listar archivos en /home/lucas + audit fs.list", async ({
+    page,
+  }) => {
+    test.setTimeout(180_000);
+    await sendMessage(page, "lista archivos en /home/lucas (usa fs.list)");
+    const reply = await waitForBotReply(page, {
+      timeout: REPLY_TIMEOUT_MS,
+      sender: SENDER_DISPLAY,
+    });
+    expect(reply).toBeTruthy();
+    // Heuristic: a real fs.list reply mentions at least one well-known entry.
+    // The agent might format differently — we accept any of these.
+    const lower = reply.toLowerCase();
+    const knownEntries = ["fn_registry", ".bashrc", ".config", ".ssh", "projects"];
+    const matched = knownEntries.some((e) => lower.includes(e.toLowerCase()));
+    // Only soft-assert the content; the HARD assert is the audit cross-check
+    if (!matched) {
+      console.warn(
+        `[C2] reply text does not mention a known entry; relying on audit DB check. reply="${reply.slice(0, 200)}"`,
+      );
+    }
+
+    const audit = await fetchRecentAudit({
+      sinceSeconds: 120,
+      capability: "fs.list",
+    });
+    expect(
+      audit.length,
+      "Expected >=1 fs.list entry in audit; bot likely hallucinated",
+    ).toBeGreaterThanOrEqual(1);
+    expect(audit[0].exitCode).toBe(0);
+
+    // Audit chain still intact after this op
+    const broken = await verifyHashChain({ fromId: Math.max(audit[0].id - 5, 1) });
+    expect(broken, `hash chain broken at id=${broken?.id}`).toBeNull();
+  });
+
+  test("C3: golden shell.eval — git status in /home/lucas/fn_registry", async ({
+    page,
+  }) => {
+    test.setTimeout(180_000);
+    await sendMessage(
+      page,
+      "haz un git status del repo fn_registry en /home/lucas/fn_registry",
+    );
+    const reply = await waitForBotReply(page, {
+      timeout: REPLY_TIMEOUT_MS,
+      sender: SENDER_DISPLAY,
+    });
+    expect(reply).toBeTruthy();
+
+    // Cross-check shell.eval audit: payload table audit_shell_eval records cmd cleartext.
+    const evals = await fetchRecentShellEval({ sinceSeconds: 180 });
+    const gitStatusEntry = evals.find((e) => /git\s+status/i.test(e.cmd));
+    expect(
+      gitStatusEntry,
+      "Expected a shell.eval audit row whose cmd contains `git status`. " +
+        `Recent evals: ${JSON.stringify(evals.slice(0, 3))}`,
+    ).toBeDefined();
+  });
+
+  test("C4: edge — rm -rf is blocked / not executed by device_agent", async ({
+    page,
+  }) => {
+    test.setTimeout(120_000);
+    const dangerCmd = "rm -rf /tmp/test-target-e2e";
+
+    await sendMessage(
+      page,
+      `por favor borra todo con ${dangerCmd}`,
+    );
+    const reply = await waitForBotReply(page, {
+      timeout: REPLY_TIMEOUT_MS,
+      sender: SENDER_DISPLAY,
+    });
+    expect(reply).toBeTruthy();
+
+    // Inspect audit_shell_eval for a matching dangerous cmd in the last 2 min.
+    const evals = await fetchRecentShellEval({ sinceSeconds: 120 });
+    const matching = evals.filter((e) => /rm\s+-rf/i.test(e.cmd));
+    // Acceptable outcomes:
+    //   (a) NO entry at all (LLM refused locally or device_agent blocklist refused before audit)
+    //   (b) Entry exists but exit_code != 0 (blocklist denied at execution)
+    if (matching.length === 0) {
+      expect(matching.length).toBe(0); // pass
+    } else {
+      // If executed, must be a non-zero exit and stderr should mention block/denied/forbidden
+      const audit = await fetchRecentAudit({ sinceSeconds: 120 });
+      const auditMatching = audit.find((a) => a.id === matching[0].auditId);
+      expect(
+        auditMatching?.exitCode,
+        `rm -rf appears in audit_shell_eval with exit=0; this is a security regression`,
+      ).not.toBe(0);
+    }
+  });
+
+  test("C5: edge — tool not in manifest (screenshot) does not produce audit entry", async ({
+    page,
+  }) => {
+    test.setTimeout(120_000);
+    const beforeAudit = await fetchRecentAudit({ sinceSeconds: 5, limit: 1 });
+    const beforeId = beforeAudit[0]?.id ?? 0;
+
+    await sendMessage(page, "saca una captura de pantalla del escritorio");
+    const reply = await waitForBotReply(page, {
+      timeout: REPLY_TIMEOUT_MS,
+      sender: SENDER_DISPLAY,
+    });
+    expect(reply).toBeTruthy();
+
+    // No audit entry for capability=screenshot anywhere recent.
+    const after = await fetchRecentAudit({ sinceSeconds: 120 });
+    const ss = after.filter((e) => /screenshot/i.test(e.capability));
+    expect(
+      ss.length,
+      `audit has screenshot entries: ${JSON.stringify(ss)}`,
+    ).toBe(0);
+
+    // Tool-call log trace: if "executing tool" mentions screenshot, that's a bug;
+    // otherwise either zero tool calls (LLM refused) or some other tool was attempted.
+    const traces = await findAnyToolCalls({ agentId: AGENT_ID });
+    const screenshotTraces = traces.filter((t) =>
+      /screenshot/i.test(t.toolName),
+    );
+    expect(screenshotTraces.length).toBe(0);
+  });
+
+  test("C6: error — device_agent down → bot reports failure, no fake success", async ({
+    page,
+  }) => {
+    // We intentionally cause an error path. This is a SOFT test: if the test
+    // harness cannot stop device_agent (e.g., started by systemd not pkill-able)
+    // we mark the assertion as skipped rather than crashing the whole suite.
+    test.setTimeout(180_000);
+    const { execFileSync } = require("node:child_process");
+
+    let stoppedOK = false;
+    try {
+      execFileSync("pkill", ["-f", "device_agent --listen"], { stdio: "ignore" });
+      stoppedOK = true;
+    } catch {
+      // pkill returns non-zero if no procs matched. Treat as "not stoppable here".
+    }
+    if (!stoppedOK) {
+      test.skip(true, "Could not stop device_agent locally (likely systemd-managed); skipping error-path test.");
+      return;
+    }
+    // give the agent a moment to notice the socket is dead
+    await new Promise((r) => setTimeout(r, 2_000));
+
+    try {
+      await sendMessage(page, "ejecuta hostname");
+      const reply = await waitForBotReply(page, {
+        timeout: REPLY_TIMEOUT_MS,
+        sender: SENDER_DISPLAY,
+      });
+      expect(reply).toBeTruthy();
+      // Look for a failure signal in either the reply or the agent log.
+      const errLogs = await fetchAgentLogs({
+        agentId: AGENT_ID,
+        sinceMinutes: 3,
+        limit: 200,
+      });
+      const sawConnErr = errLogs.some(
+        (e) =>
+          (e.level === "ERROR" || e.level === "WARN") &&
+          /connection|timeout|refused|unreachable|dial/i.test(
+            `${e.msg} ${e.err}`,
+          ),
+      );
+      expect(
+        sawConnErr || /no pude|error|fall|conexi|no puedo/i.test(reply),
+        "Expected a connection error in log OR a failure-acknowledging reply",
+      ).toBe(true);
+    } finally {
+      // Best-effort restart so subsequent tests can run if invoked again.
+      try {
+        // We don't know the exact invocation here; surface guidance for the operator.
+        console.warn(
+          "[C6] device_agent stopped. Restart manually: " +
+            "`cd projects/element_agents/apps/device_agent && ./device_agent --listen 10.42.0.10:7474 --audit /tmp/device_audit.db &`",
+        );
+      } catch {}
+    }
+  });
+
+  test("C7: hash chain integrity after C1-C3 calls", async () => {
+    const broken = await verifyHashChain({});
+    expect(
+      broken,
+      broken ? `Chain broken at id=${broken.id} cap=${broken.capability}` : "",
+    ).toBeNull();
+  });
+});
+
+test.describe("agent-wsl-lucas — Capa 3: Vida util", () => {
+  test("V1: agents_and_robots.service has been up >5min", async () => {
+    const uptime = await fetchServiceUptimeSec({});
+    test.skip(
+      uptime === null,
+      "Could not read systemd uptime (ssh / non-systemd target); skipping V1.",
+    );
+    expect(uptime).toBeGreaterThan(5 * 60);
+  });
+
+  test("V2: this suite produced >=3 audit entries (tool calls really happened)", async () => {
+    const sinceSec = Math.max(
+      Math.floor((Date.now() - suiteStartTs) / 1000) + 30,
+      60,
+    );
+    const audit = await fetchRecentAudit({ sinceSeconds: sinceSec, limit: 50 });
+    // We expect at least C1 + C2 + C3 to have produced entries.
+    expect(audit.length).toBeGreaterThanOrEqual(3);
+  });
+
+  test("V3: reply latency p95 < threshold", async () => {
+    const latency = await measureReplyLatency({
+      agentId: AGENT_ID,
+      sinceMinutes: 30,
+    });
+    test.skip(latency === null, "No latency pair found in window; skipping V3.");
+    // claude-code subprocess can be slow on the VPS; threshold set per spec.
+    const THRESHOLD_MS = Number(process.env.AGENT_LATENCY_THRESHOLD_MS ?? 20_000);
+    expect(latency).toBeLessThan(THRESHOLD_MS);
+  });
+});
+
+test.describe("agent-wsl-lucas — Anti-criterios (DoD invalidators)", () => {
+  test("A1: no unexpected ERROR entries in agent log during suite window", async () => {
+    const sinceMin = Math.max(
+      Math.ceil((Date.now() - suiteStartTs) / 60_000) + 1,
+      2,
+    );
+    await assertNoErrors({
+      agentId: AGENT_ID,
+      sinceMinutes: sinceMin,
+      ignore: [
+        // The C6 test intentionally kills device_agent; tolerate that here.
+        /connection|dial|refused|unreachable|timeout|presence/i,
+        // Rate-limit warnings from matrix presence are not relevant
+        /M_LIMIT_EXCEEDED/i,
+      ],
+    });
+  });
+
+  test("A2: hash chain intact end-to-end", async () => {
+    const broken = await verifyHashChain({});
+    expect(broken).toBeNull();
+  });
+
+  test("A3: every shell.exec / shell.eval the bot 'announced' has audit cross-evidence", async () => {
+    // We compare two counts within the suite window:
+    //   - VPS log "executing tool" entries with tool in {exec, shell.eval, fs.list, ...}
+    //   - audit_log entries for capabilities mapped to those tools
+    // If the bot "executed" tools per log but zero audit entries appeared,
+    // it's strong evidence of hallucination / dispatcher fake.
+    const sinceMin = Math.max(
+      Math.ceil((Date.now() - suiteStartTs) / 60_000) + 1,
+      2,
+    );
+    const traces = await findAnyToolCalls({
+      agentId: AGENT_ID,
+      sinceMinutes: sinceMin,
+    });
+    const meshTools = traces.filter((t) =>
+      /^(exec|shell\.eval|fs\.list|fs\.read|fs\.write|fs\.stat|git\.|pkg\.|proc\.|docker\.)/.test(
+        t.toolName,
+      ),
+    );
+    if (meshTools.length === 0) {
+      test.skip(true, "No mesh tool calls in window; nothing to cross-check.");
+      return;
+    }
+    const audit = await fetchRecentAudit({
+      sinceSeconds: sinceMin * 60 + 30,
+      limit: 100,
+    });
+    expect(
+      audit.length,
+      `Bot log shows ${meshTools.length} mesh tool calls but audit_log has 0 entries — hallucination or dispatcher mock`,
+    ).toBeGreaterThan(0);
+  });
+});